-rw-r--r--  drivers/block/rbd.c                 305
-rw-r--r--  fs/ceph/addr.c                      214
-rw-r--r--  fs/ceph/cache.c                       2
-rw-r--r--  fs/ceph/caps.c                       51
-rw-r--r--  fs/ceph/debugfs.c                     2
-rw-r--r--  fs/ceph/dir.c                       376
-rw-r--r--  fs/ceph/file.c                       89
-rw-r--r--  fs/ceph/inode.c                     159
-rw-r--r--  fs/ceph/ioctl.c                      14
-rw-r--r--  fs/ceph/mds_client.c                140
-rw-r--r--  fs/ceph/mds_client.h                 17
-rw-r--r--  fs/ceph/mdsmap.c                     43
-rw-r--r--  fs/ceph/super.c                      47
-rw-r--r--  fs/ceph/super.h                      12
-rw-r--r--  fs/ceph/xattr.c                      25
-rw-r--r--  include/linux/ceph/ceph_frag.h        4
-rw-r--r--  include/linux/ceph/ceph_fs.h         20
-rw-r--r--  include/linux/ceph/decode.h           2
-rw-r--r--  include/linux/ceph/libceph.h         57
-rw-r--r--  include/linux/ceph/mon_client.h      23
-rw-r--r--  include/linux/ceph/osd_client.h     231
-rw-r--r--  include/linux/ceph/osdmap.h         158
-rw-r--r--  include/linux/ceph/rados.h           34
-rw-r--r--  net/ceph/ceph_common.c                2
-rw-r--r--  net/ceph/ceph_strings.c              16
-rw-r--r--  net/ceph/debugfs.c                  147
-rw-r--r--  net/ceph/mon_client.c               393
-rw-r--r--  net/ceph/osd_client.c              4032
-rw-r--r--  net/ceph/osdmap.c                   651
29 files changed, 4758 insertions, 2508 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0ede6d7e2568..81666a56415e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -350,12 +350,12 @@ struct rbd_device {
 	struct rbd_spec		*spec;
 	struct rbd_options	*opts;
 
-	char			*header_name;
+	struct ceph_object_id	header_oid;
+	struct ceph_object_locator header_oloc;
 
 	struct ceph_file_layout	layout;
 
-	struct ceph_osd_event	*watch_event;
-	struct rbd_obj_request	*watch_request;
+	struct ceph_osd_linger_request *watch_handle;
 
 	struct rbd_spec		*parent_spec;
 	u64			parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-					unsigned long timeout)
-{
-	return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 	complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 	obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-				 struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
 	u16 opcode;
 
-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	dout("%s: osd_req %p\n", __func__, osd_req);
 	rbd_assert(osd_req == obj_request->osd_req);
 	if (obj_request_img_data_test(obj_request)) {
 		rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_CALL:
 		rbd_osd_call_callback(obj_request);
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
-	case CEPH_OSD_OP_WATCH:
-		rbd_osd_trivial_callback(obj_request);
-		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu",
 			obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	u64 snap_id;
 
-	rbd_assert(osd_req != NULL);
-
-	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			NULL, snap_id, NULL);
+	if (img_request)
+		osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	struct ceph_snap_context *snapc;
-	struct timespec mtime = CURRENT_TIME;
 
-	rbd_assert(osd_req != NULL);
-
-	snapc = img_request ? img_request->snapc : NULL;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			snapc, CEPH_NOSNAP, &mtime);
+	osd_req->r_mtime = CURRENT_TIME;
+	osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
 					  GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
 					  false, GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 {
 	struct rbd_obj_request *obj_request;
 	struct rbd_obj_request *next_obj_request;
+	int ret = 0;
 
 	dout("%s: img %p\n", __func__, img_request);
-	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
-		int ret;
 
+	rbd_img_request_get(img_request);
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
 		ret = rbd_img_obj_request_submit(obj_request);
 		if (ret)
-			return ret;
+			goto out_put_ireq;
 	}
 
-	return 0;
+out_put_ireq:
+	rbd_img_request_put(img_request);
+	return ret;
 }
 
 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ out_err:
 	obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-	struct rbd_obj_request *obj_request;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	int ret;
-
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return -ENOMEM;
-
-	ret = -ENOMEM;
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-						  obj_request);
-	if (!obj_request->osd_req)
-		goto out;
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-			      notify_id, 0, 0);
-	rbd_osd_req_format_read(obj_request);
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
-	ret = rbd_obj_request_wait(obj_request);
-out:
-	rbd_obj_request_put(obj_request);
-
-	return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
 {
-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = arg;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-	     rbd_dev->header_name, (unsigned long long)notify_id,
-	     (unsigned int)opcode);
+	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+	     cookie, notify_id);
 
 	/*
 	 * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (ret)
 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+				   &rbd_dev->header_oloc, notify_id, cookie,
+				   NULL, 0);
 	if (ret)
 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-						struct rbd_device *rbd_dev,
-						bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct ceph_options *opts = osdc->client->options;
-	struct rbd_obj_request *obj_request;
+	struct rbd_device *rbd_dev = arg;
 	int ret;
 
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return ERR_PTR(-ENOMEM);
-
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-						  obj_request);
-	if (!obj_request->osd_req) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, watch);
-	rbd_osd_req_format_write(obj_request);
+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-	if (watch)
-		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
-	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-	if (ret)
-		goto out;
-
-	ret = obj_request->result;
+	ret = rbd_dev_header_watch_sync(rbd_dev);
 	if (ret) {
-		if (watch)
-			rbd_obj_request_end(obj_request);
-		goto out;
+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+		return;
 	}
 
-	return obj_request;
-
-out:
-	rbd_obj_request_put(obj_request);
-	return ERR_PTR(ret);
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
 }
 
 /*
@@ -3205,35 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct rbd_obj_request *obj_request;
-	int ret;
+	struct ceph_osd_linger_request *handle;
 
-	rbd_assert(!rbd_dev->watch_event);
-	rbd_assert(!rbd_dev->watch_request);
+	rbd_assert(!rbd_dev->watch_handle);
 
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-				     &rbd_dev->watch_event);
-	if (ret < 0)
-		return ret;
+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-	if (IS_ERR(obj_request)) {
-		ceph_osdc_cancel_event(rbd_dev->watch_event);
-		rbd_dev->watch_event = NULL;
-		return PTR_ERR(obj_request);
-	}
+	rbd_dev->watch_handle = handle;
+	return 0;
+}
 
-	/*
-	 * A watch request is set to linger, so the underlying osd
-	 * request won't go away until we unregister it.  We retain
-	 * a pointer to the object request during that time (in
-	 * rbd_dev->watch_request), so we'll keep a reference to it.
-	 * We'll drop that reference after we've unregistered it in
-	 * rbd_dev_header_unwatch_sync().
-	 */
-	rbd_dev->watch_request = obj_request;
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	return 0;
+	if (!rbd_dev->watch_handle)
+		return;
+
+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+	if (ret)
+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+	rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
  */
 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-	struct rbd_obj_request *obj_request;
-
-	rbd_assert(rbd_dev->watch_event);
-	rbd_assert(rbd_dev->watch_request);
-
-	rbd_obj_request_end(rbd_dev->watch_request);
-	rbd_obj_request_put(rbd_dev->watch_request);
-	rbd_dev->watch_request = NULL;
-
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-	if (!IS_ERR(obj_request))
-		rbd_obj_request_put(obj_request);
-	else
-		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-			 PTR_ERR(obj_request));
-
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
 	dout("%s flushing notifies\n", __func__);
 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
 	if (!ondisk)
 		return -ENOMEM;
 
-	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
 				0, size, ondisk);
 	if (ret < 0)
 		goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	bool need_put = !!rbd_dev->opts;
 
+	ceph_oid_destroy(&rbd_dev->header_oid);
+
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	INIT_LIST_HEAD(&rbd_dev->node);
 	init_rwsem(&rbd_dev->header_rwsem);
 
+	ceph_oid_init(&rbd_dev->header_oid);
+	ceph_oloc_init(&rbd_dev->header_oloc);
+
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
 	rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 size;
 	} __attribute__ ((packed)) size_buf = { 0 };
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_size",
 				&snapid, sizeof (snapid),
 				&size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_object_prefix", NULL, 0,
 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	u64 unsup;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_features",
 				&snapid, sizeof (snapid),
 				&features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	}
 
 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_parent",
 				&snapid, sizeof (snapid),
 				reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 	u64 stripe_count;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_stripe_unit_count", NULL, 0,
 				(char *)&striping_info_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_snapcontext", NULL, 0,
 				reply_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 		return ERR_PTR(-ENOMEM);
 
 	snapid = cpu_to_le64(snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				"rbd", "get_snapshot_name",
 				&snapid, sizeof (snapid),
 				reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
 	if (ret == -ENOENT && tries++ < 1) {
-		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-					       &newest_epoch);
+		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+					    &newest_epoch);
 		if (ret < 0)
 			return ret;
 
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-			ceph_monc_request_next_osdmap(&rbdc->client->monc);
+			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
 					newest_epoch,
 					opts->mount_timeout);
@@ -5260,35 +5181,26 @@ err_out_unlock:
 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 {
 	struct rbd_spec *spec = rbd_dev->spec;
-	size_t size;
+	int ret;
 
 	/* Record the header object name for this rbd image. */
 
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	if (rbd_dev->image_format == 1)
-		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
 	else
-		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
-
-	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
-	if (!rbd_dev->header_name)
-		return -ENOMEM;
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);
 
-	if (rbd_dev->image_format == 1)
-		sprintf(rbd_dev->header_name, "%s%s",
-			spec->image_name, RBD_SUFFIX);
-	else
-		sprintf(rbd_dev->header_name, "%s%s",
-			RBD_HEADER_PREFIX, spec->image_id);
-	return 0;
+	return ret;
 }
 
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
 	rbd_dev_unprobe(rbd_dev);
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
 	rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 			pr_info("image %s/%s does not exist\n",
 				rbd_dev->spec->pool_name,
 				rbd_dev->spec->image_name);
-			goto out_header_name;
+			goto err_out_format;
 		}
 	}
 
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 		goto err_out_probe;
 
 	dout("discovered format %u image, header name is %s\n",
-		rbd_dev->image_format, rbd_dev->header_name);
+		rbd_dev->image_format, rbd_dev->header_oid.name);
 	return 0;
 
 err_out_probe:
@@ -5381,9 +5293,6 @@
 err_out_watch:
 	if (!depth)
 		rbd_dev_header_unwatch_sync(rbd_dev);
-out_header_name:
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 err_out_format:
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
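
Note: the rbd.c changes above replace the old watch_event/watch_request
machinery with libceph's linger API.  A minimal sketch of the lifecycle as
rbd now drives it, restating only calls that appear in this diff (the
surrounding control flow is abbreviated and illustrative):

	/* register: one call returns an opaque linger handle */
	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	/* in rbd_watch_cb: acknowledge each notify by (notify_id, cookie) */
	ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
			     &rbd_dev->header_oloc, notify_id, cookie,
			     NULL, 0);

	/* teardown: one call replaces the unwatch request + cancel_event */
	ceph_osdc_unwatch(osdc, handle);
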
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 43098cd9602b..eeb71e5de27a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int rc = req->r_result <= 0 ? req->r_result : 0;
+	int bytes = req->r_result >= 0 ? req->r_result : 0;
 	int num_pages;
 	int i;
 
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   truncate_seq, truncate_size,
 				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
-		dout("writepage setting page/mapping error %d %p\n", err, page);
+		struct writeback_control tmp_wbc;
+		if (!wbc)
+			wbc = &tmp_wbc;
+		if (err == -ERESTARTSYS) {
+			/* killed by SIGKILL */
+			dout("writepage interrupted page %p\n", page);
+			redirty_page_for_writepage(wbc, page);
+			end_page_writeback(page);
+			goto out;
+		}
+		dout("writepage setting page/mapping error %d %p\n",
+		     err, page);
 		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
+		wbc->pages_skipped++;
 	} else {
 		dout("writepage cleaned page %p\n", page);
 		err = 0;  /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 	BUG_ON(!inode);
 	ihold(inode);
 	err = writepage_nounlock(page, wbc);
+	if (err == -ERESTARTSYS) {
+		/* direct memory reclaimer was killed by SIGKILL. return 0
+		 * to prevent caller from setting mapping/page error */
+		err = 0;
+	}
 	unlock_page(page);
 	iput(inode);
 	return err;
 }
 
-
 /*
  * lame release_pages helper.  release_pages() isn't exported to
  * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	bool remove_page;
 
-
 	dout("writepages_finish %p rc %d\n", inode, rc);
 	if (rc < 0)
 		mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@
 			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
+		if (rc < 0)
+			SetPageError(page);
+
 		ceph_put_snap_context(page_snap_context(page));
 		page->private = 0;
 		ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
 	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		pr_warn("writepage_start %p on forced umount\n", inode);
-		truncate_pagecache(inode, 0);
+		if (ci->i_wrbuffer_ref > 0) {
+			pr_warn_ratelimited(
+				"writepage_start %p %lld forced umount\n",
+				inode, ceph_ino(inode));
+		}
 		mapping_set_error(mapping, -EIO);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
@@ -1063,10 +1079,7 @@ new_request:
 			pages = NULL;
 		}
 
-		vino = ceph_vino(inode);
-		ceph_osdc_build_request(req, offset, snapc, vino.snap,
-					&inode->i_mtime);
-
+		req->r_mtime = inode->i_mtime;
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
 		req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
 		mapping->writeback_index = index;
 
 out:
-	if (req)
-		ceph_osdc_put_request(req);
+	ceph_osdc_put_request(req);
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
 	return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
 			    struct page *page)
 {
 	struct inode *inode = file_inode(file);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	loff_t page_off = pos & PAGE_MASK;
 	int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@
 	int r;
 	struct ceph_snap_context *snapc, *oldest;
 
+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		dout(" page %p forced umount\n", page);
+		unlock_page(page);
+		return -EIO;
+	}
+
retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
 	wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
 		snapc = ceph_get_snap_context(snapc);
 		unlock_page(page);
 		ceph_queue_writeback(inode);
-		r = wait_event_interruptible(ci->i_cap_wq,
+		r = wait_event_killable(ci->i_cap_wq,
		       context_is_writeable_or_written(inode, snapc));
 		ceph_put_snap_context(snapc);
 		if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
 	.direct_IO = ceph_direct_io,
 };
 
+static void ceph_block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
 
 /*
  * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *pinned_page = NULL;
 	loff_t off = vmf->pgoff << PAGE_SHIFT;
 	int want, got, ret;
+	sigset_t oldset;
+
+	ceph_block_sigs(&oldset);
 
 	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
 	     inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_CACHE;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
-				    -1, &got, &pinned_page);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	if (ret < 0)
+		goto out_restore;
+
 	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
 	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ceph_put_cap_refs(ci, got);
 
 	if (ret != -EAGAIN)
-		return ret;
+		goto out_restore;
 
 	/* read inline data */
 	if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					~__GFP_FS));
 		if (!page) {
 			ret = VM_FAULT_OOM;
-			goto out;
+			goto out_inline;
 		}
 		ret1 = __ceph_do_getattr(inode, page,
 					 CEPH_STAT_CAP_INLINE_DATA, true);
 		if (ret1 < 0 || off >= i_size_read(inode)) {
 			unlock_page(page);
 			put_page(page);
-			ret = VM_FAULT_SIGBUS;
-			goto out;
+			if (ret1 < 0)
+				ret = ret1;
+			else
+				ret = VM_FAULT_SIGBUS;
+			goto out_inline;
 		}
 		if (ret1 < PAGE_SIZE)
 			zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		SetPageUptodate(page);
 		vmf->page = page;
 		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+		dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+		     inode, off, (size_t)PAGE_SIZE, ret);
 	}
-out:
-	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-	     inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+	ceph_restore_sigs(&oldset);
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
 	return ret;
 }
 
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
+	sigset_t oldset;
 
 	prealloc_cf = ceph_alloc_cap_flush();
 	if (!prealloc_cf)
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_OOM;
+
+	ceph_block_sigs(&oldset);
 
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0) {
-			ret = VM_FAULT_SIGBUS;
+		if (ret < 0)
 			goto out_free;
-		}
 	}
 
 	if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_BUFFER;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
-				    &got, NULL);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			ret = VM_FAULT_SIGBUS;
-			goto out_free;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+			    &got, NULL);
+	if (ret < 0)
+		goto out_free;
+
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
 	     inode, off, len, ceph_cap_string(got));
 
 	/* Update time before taking page lock */
 	file_update_time(vma->vm_file);
 
-	lock_page(page);
+	do {
+		lock_page(page);
 
-	ret = VM_FAULT_NOPAGE;
-	if ((off > size) ||
-	    (page->mapping != inode->i_mapping)) {
-		unlock_page(page);
-		goto out;
-	}
+		if ((off > size) || (page->mapping != inode->i_mapping)) {
+			unlock_page(page);
+			ret = VM_FAULT_NOPAGE;
+			break;
+		}
+
+		ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+		if (ret >= 0) {
+			/* success.  we'll keep the page locked. */
+			set_page_dirty(page);
+			ret = VM_FAULT_LOCKED;
+		}
+	} while (ret == -EAGAIN);
 
-	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-	if (ret >= 0) {
-		/* success.  we'll keep the page locked. */
-		set_page_dirty(page);
-		ret = VM_FAULT_LOCKED;
-	} else {
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else
-			ret = VM_FAULT_SIGBUS;
-	}
-out:
 	if (ret == VM_FAULT_LOCKED ||
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
 		int dirty;
@@ -1495,8 +1523,10 @@ out:
 	     inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
out_free:
+	ceph_restore_sigs(&oldset);
 	ceph_free_cap_flush(prealloc_cf);
-
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
 	return ret;
 }
 
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out_put;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
-	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-		 "%llx.00000000", ci->i_vino.ino);
-	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
 					 1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-	wr_req->r_base_oloc.pool = pool;
-	wr_req->r_base_oid = rd_req->r_base_oid;
+	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	/* one page should be large enough for STAT data */
 	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
out_unlock:
 	up_write(&mdsc->pool_perm_rwsem);
 
-	if (rd_req)
-		ceph_osdc_put_request(rd_req);
-	if (wr_req)
-		ceph_osdc_put_request(wr_req);
+	ceph_osdc_put_request(rd_req);
+	ceph_osdc_put_request(wr_req);
out:
 	if (!err)
 		err = have;
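
Note: the recurring change in fs/ceph/addr.c is the removal of
ceph_osdc_build_request(): its arguments become request fields (r_mtime,
r_snapid, r_data_offset), and callers that fill in r_base_oid/r_base_oloc
themselves must call ceph_osdc_alloc_messages() before submitting.  A
condensed sketch of the new pattern, modeled on __ceph_pool_perm_get()
above (osdc, pool and ino are assumed to be in scope; error paths
abbreviated):

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOFS);
	if (!req)
		return -ENOMEM;

	req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req_op_init(req, 0, CEPH_OSD_OP_STAT, 0);
	req->r_base_oloc.pool = pool;
	ceph_oid_printf(&req->r_base_oid, "%llx.00000000", ino);

	/* explicit message allocation replaces ceph_osdc_build_request() */
	err = ceph_osdc_alloc_messages(req, GFP_NOFS);
	if (err)
		goto out_put;

	/* a write request would set the mtime directly:
	 *	req->r_mtime = inode->i_mtime;			*/
	err = ceph_osdc_start_request(osdc, req, false);
	if (!err)
		err = ceph_osdc_wait_request(osdc, req);
out_put:
	ceph_osdc_put_request(req);
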
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc..c052b5bf219b 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
 	unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
 	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
 		(ci->i_fscache_gen == ci->i_rdcache_gen));
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cfaeef18cbca..c17b5d76d75e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1656,7 +1656,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
-	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
+	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&  /* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
 	    (revoking & (CEPH_CAP_FILE_CACHE|
 			 CEPH_CAP_FILE_LAZYIO)) && /*  or revoking cache */
@@ -1698,8 +1698,8 @@
 
 		revoking = cap->implemented & ~cap->issued;
 		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
-		     cap->mds, cap, ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap_used),
+		     cap->mds, cap, ceph_cap_string(cap_used),
+		     ceph_cap_string(cap->issued),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -2317,7 +2317,7 @@ again:
 
 	/* make sure file is actually open */
 	file_wanted = __ceph_caps_file_wanted(ci);
-	if ((file_wanted & need) == 0) {
+	if ((file_wanted & need) != need) {
 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
 		*err = -EBADF;
@@ -2412,12 +2412,26 @@ again:
 		goto out_unlock;
 	}
 
-	if (!__ceph_is_any_caps(ci) &&
-	    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		dout("get_cap_refs %p forced umount\n", inode);
-		*err = -EIO;
-		ret = 1;
-		goto out_unlock;
+	if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+		int mds_wanted;
+		if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+		    CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			*err = -EIO;
+			ret = 1;
+			goto out_unlock;
+		}
+		mds_wanted = __ceph_caps_mds_wanted(ci);
+		if ((mds_wanted & need) != need) {
+			dout("get_cap_refs %p caps were dropped"
+			     " (session killed?)\n", inode);
+			*err = -ESTALE;
+			ret = 1;
+			goto out_unlock;
+		}
+		if ((mds_wanted & file_wanted) ==
+		    (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+			ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
 	}
 
 	dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			if (err == -EAGAIN)
 				continue;
 			if (err < 0)
-				return err;
+				ret = err;
 		} else {
 			ret = wait_event_interruptible(ci->i_cap_wq,
 					try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2510,15 @@
 				continue;
 			if (err < 0)
 				ret = err;
-			if (ret < 0)
-				return ret;
+		}
+		if (ret < 0) {
+			if (err == -ESTALE) {
+				/* session was killed, try renew caps */
+				ret = ceph_renew_caps(&ci->vfs_inode);
+				if (ret == 0)
+					continue;
+			}
+			return ret;
 		}
 
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
 	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-	    !ci->i_wrbuffer_ref) {
+	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
 		if (try_nonblocking_invalidate(inode)) {
 			/* there were locked pages.. invalidate later
 			   in a separate thread. */
@@ -3226,6 +3247,8 @@ retry:
 
 	if (target < 0) {
 		__ceph_remove_cap(cap, false);
+		if (!ci->i_auth_cap)
+			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 		goto out_unlock;
 	}
 
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 31f831471ed2..39ff678e567f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 			   path ? path : "");
 		spin_unlock(&req->r_old_dentry->d_lock);
 		kfree(path);
-	} else if (req->r_path2) {
+	} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
 		if (req->r_ino2.ino)
 			seq_printf(s, " #%llx/%s", req->r_ino2.ino,
 				   req->r_path2);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3ab1192d2029..6e0fedf6713b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -70,16 +70,42 @@ out_unlock:
 }
 
 /*
- * for readdir, we encode the directory frag and offset within that
- * frag into f_pos.
+ * for f_pos for readdir:
+ * - hash order:
+ *	(0xff << 52) | ((24 bits hash) << 28) |
+ *	(the nth entry has hash collision);
+ * - frag+name order;
+ *	((frag value) << 28) | (the nth entry in frag);
 */
+#define OFFSET_BITS	28
+#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
+#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))
+loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
+{
+	loff_t fpos = ((loff_t)high << 28) | (loff_t)off;
+	if (hash_order)
+		fpos |= HASH_ORDER;
+	return fpos;
+}
+
+static bool is_hash_order(loff_t p)
+{
+	return (p & HASH_ORDER) == HASH_ORDER;
+}
+
 static unsigned fpos_frag(loff_t p)
 {
-	return p >> 32;
+	return p >> OFFSET_BITS;
 }
+
+static unsigned fpos_hash(loff_t p)
+{
+	return ceph_frag_value(fpos_frag(p));
+}
+
 static unsigned fpos_off(loff_t p)
 {
-	return p & 0xffffffff;
+	return p & OFFSET_MASK;
 }
 
 static int fpos_cmp(loff_t l, loff_t r)
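
Note: a worked example of the new f_pos encoding above (values illustrative):

	/*
	 * frag+name order: frag 0x2, 3rd entry within the frag:
	 *	ceph_make_fpos(0x2, 3, false)
	 *		== (0x2ULL << 28) | 3 == 0x20000003
	 *
	 * hash order: 24-bit name hash 0xc0ffee, 1st hash collision:
	 *	ceph_make_fpos(0xc0ffee, 1, true)
	 *		== (0xffULL << 52) | (0xc0ffeeULL << 28) | 1
	 *		== 0xffc0ffee0000001
	 *
	 * is_hash_order() just tests the 0xff marker in the top bits.
	 */
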
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
111 return 0; 137 return 0;
112} 138}
113 139
140
141static struct dentry *
142__dcache_find_get_entry(struct dentry *parent, u64 idx,
143 struct ceph_readdir_cache_control *cache_ctl)
144{
145 struct inode *dir = d_inode(parent);
146 struct dentry *dentry;
147 unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
148 loff_t ptr_pos = idx * sizeof(struct dentry *);
149 pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
150
151 if (ptr_pos >= i_size_read(dir))
152 return NULL;
153
154 if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
155 ceph_readdir_cache_release(cache_ctl);
156 cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
157 if (!cache_ctl->page) {
158 dout(" page %lu not found\n", ptr_pgoff);
159 return ERR_PTR(-EAGAIN);
160 }
161 /* reading/filling the cache are serialized by
162 i_mutex, no need to use page lock */
163 unlock_page(cache_ctl->page);
164 cache_ctl->dentries = kmap(cache_ctl->page);
165 }
166
167 cache_ctl->index = idx & idx_mask;
168
169 rcu_read_lock();
170 spin_lock(&parent->d_lock);
171 /* check i_size again here, because empty directory can be
172 * marked as complete while not holding the i_mutex. */
173 if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
174 dentry = cache_ctl->dentries[cache_ctl->index];
175 else
176 dentry = NULL;
177 spin_unlock(&parent->d_lock);
178 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
179 dentry = NULL;
180 rcu_read_unlock();
181 return dentry ? : ERR_PTR(-EAGAIN);
182}
183
114/* 184/*
115 * When possible, we try to satisfy a readdir by peeking at the 185 * When possible, we try to satisfy a readdir by peeking at the
116 * dcache. We make this work by carefully ordering dentries on 186 * dcache. We make this work by carefully ordering dentries on
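__dcache_find_get_entry() treats the directory's page cache as one flat array of dentry pointers, so a cache index maps to a page and an in-page slot with shift/mask arithmetic. A quick standalone check of that math (assuming 4 KiB pages and 8-byte pointers):

    #include <stdio.h>

    #define PAGE_SIZE  4096UL
    #define PAGE_SHIFT 12

    int main(void)
    {
            unsigned long ptr_size = sizeof(void *);            /* 8 on 64-bit */
            unsigned long idx_mask = PAGE_SIZE / ptr_size - 1;  /* 511 */
            unsigned long idx = 1000;                           /* cache index */
            unsigned long ptr_pos = idx * ptr_size;             /* byte 8000 */
            unsigned long pgoff = ptr_pos >> PAGE_SHIFT;        /* page 1 */
            unsigned long slot = idx & idx_mask;                /* slot 488 */

            printf("idx %lu -> page %lu, slot %lu\n", idx, pgoff, slot);
            return 0;
    }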
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
130 struct inode *dir = d_inode(parent); 200 struct inode *dir = d_inode(parent);
131 struct dentry *dentry, *last = NULL; 201 struct dentry *dentry, *last = NULL;
132 struct ceph_dentry_info *di; 202 struct ceph_dentry_info *di;
133 unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
134 int err = 0;
135 loff_t ptr_pos = 0;
136 struct ceph_readdir_cache_control cache_ctl = {}; 203 struct ceph_readdir_cache_control cache_ctl = {};
204 u64 idx = 0;
205 int err = 0;
137 206
138 dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); 207 dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
208
209 /* search start position */
210 if (ctx->pos > 2) {
211 u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
212 while (count > 0) {
213 u64 step = count >> 1;
214 dentry = __dcache_find_get_entry(parent, idx + step,
215 &cache_ctl);
216 if (!dentry) {
 217 /* use linear search */
218 idx = 0;
219 break;
220 }
221 if (IS_ERR(dentry)) {
222 err = PTR_ERR(dentry);
223 goto out;
224 }
225 di = ceph_dentry(dentry);
226 spin_lock(&dentry->d_lock);
227 if (fpos_cmp(di->offset, ctx->pos) < 0) {
228 idx += step + 1;
229 count -= step + 1;
230 } else {
231 count = step;
232 }
233 spin_unlock(&dentry->d_lock);
234 dput(dentry);
235 }
139 236
140 /* we can calculate cache index for the first dirfrag */ 237 dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
141 if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
142 cache_ctl.index = fpos_off(ctx->pos) - 2;
143 BUG_ON(cache_ctl.index < 0);
144 ptr_pos = cache_ctl.index * sizeof(struct dentry *);
145 } 238 }
146 239
147 while (true) {
148 pgoff_t pgoff;
149 bool emit_dentry;
150 240
151 if (ptr_pos >= i_size_read(dir)) { 241 for (;;) {
242 bool emit_dentry = false;
243 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
244 if (!dentry) {
152 fi->flags |= CEPH_F_ATEND; 245 fi->flags |= CEPH_F_ATEND;
153 err = 0; 246 err = 0;
154 break; 247 break;
155 } 248 }
156 249 if (IS_ERR(dentry)) {
157 err = -EAGAIN; 250 err = PTR_ERR(dentry);
158 pgoff = ptr_pos >> PAGE_SHIFT; 251 goto out;
159 if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
160 ceph_readdir_cache_release(&cache_ctl);
161 cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
162 if (!cache_ctl.page) {
163 dout(" page %lu not found\n", pgoff);
164 break;
165 }
166 /* reading/filling the cache are serialized by
167 * i_mutex, no need to use page lock */
168 unlock_page(cache_ctl.page);
169 cache_ctl.dentries = kmap(cache_ctl.page);
170 } 252 }
171 253
172 rcu_read_lock();
173 spin_lock(&parent->d_lock);
174 /* check i_size again here, because empty directory can be
175 * marked as complete while not holding the i_mutex. */
176 if (ceph_dir_is_complete_ordered(dir) &&
177 ptr_pos < i_size_read(dir))
178 dentry = cache_ctl.dentries[cache_ctl.index % nsize];
179 else
180 dentry = NULL;
181 spin_unlock(&parent->d_lock);
182 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
183 dentry = NULL;
184 rcu_read_unlock();
185 if (!dentry)
186 break;
187
188 emit_dentry = false;
189 di = ceph_dentry(dentry); 254 di = ceph_dentry(dentry);
190 spin_lock(&dentry->d_lock); 255 spin_lock(&dentry->d_lock);
191 if (di->lease_shared_gen == shared_gen && 256 if (di->lease_shared_gen == shared_gen &&
192 d_really_is_positive(dentry) && 257 d_really_is_positive(dentry) &&
193 ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
194 ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
195 fpos_cmp(ctx->pos, di->offset) <= 0) { 258 fpos_cmp(ctx->pos, di->offset) <= 0) {
196 emit_dentry = true; 259 emit_dentry = true;
197 } 260 }
198 spin_unlock(&dentry->d_lock); 261 spin_unlock(&dentry->d_lock);
199 262
200 if (emit_dentry) { 263 if (emit_dentry) {
201 dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, 264 dout(" %llx dentry %p %pd %p\n", di->offset,
202 dentry, dentry, d_inode(dentry)); 265 dentry, dentry, d_inode(dentry));
203 ctx->pos = di->offset; 266 ctx->pos = di->offset;
204 if (!dir_emit(ctx, dentry->d_name.name, 267 if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
218 } else { 281 } else {
219 dput(dentry); 282 dput(dentry);
220 } 283 }
221
222 cache_ctl.index++;
223 ptr_pos += sizeof(struct dentry *);
224 } 284 }
285out:
225 ceph_readdir_cache_release(&cache_ctl); 286 ceph_readdir_cache_release(&cache_ctl);
226 if (last) { 287 if (last) {
227 int ret; 288 int ret;
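The start-position search added above is a plain lower bound over the monotonically increasing dentry offsets, and the identical loop shape reappears below over rinfo->dir_entries in ceph_readdir(). In isolation, as a sketch:

    /* First index i in sorted off[0..n) with off[i] >= pos, else n;
     * e.g. off = {2, 3, 7, 9}, pos = 5 -> returns 2. */
    static int lower_bound(const long long *off, int n, long long pos)
    {
            int i = 0;
            while (n > 0) {
                    int step = n >> 1;
                    if (off[i + step] < pos) {
                            i += step + 1;
                            n -= step + 1;
                    } else {
                            n = step;
                    }
            }
            return i;
    }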
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
235 return err; 296 return err;
236} 297}
237 298
299static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
300{
301 if (!fi->last_readdir)
302 return true;
303 if (is_hash_order(pos))
304 return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
305 else
306 return fi->frag != fpos_frag(pos);
307}
308
238static int ceph_readdir(struct file *file, struct dir_context *ctx) 309static int ceph_readdir(struct file *file, struct dir_context *ctx)
239{ 310{
240 struct ceph_file_info *fi = file->private_data; 311 struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
242 struct ceph_inode_info *ci = ceph_inode(inode); 313 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 314 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 315 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(ctx->pos); 316 int i;
246 int off = fpos_off(ctx->pos);
247 int err; 317 int err;
248 u32 ftype; 318 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 319 struct ceph_mds_reply_info_parsed *rinfo;
250 320
251 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); 321 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
252 if (fi->flags & CEPH_F_ATEND) 322 if (fi->flags & CEPH_F_ATEND)
253 return 0; 323 return 0;
254 324
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
260 inode->i_mode >> 12)) 330 inode->i_mode >> 12))
261 return 0; 331 return 0;
262 ctx->pos = 1; 332 ctx->pos = 1;
263 off = 1;
264 } 333 }
265 if (ctx->pos == 1) { 334 if (ctx->pos == 1) {
266 ino_t ino = parent_ino(file->f_path.dentry); 335 ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
270 inode->i_mode >> 12)) 339 inode->i_mode >> 12))
271 return 0; 340 return 0;
272 ctx->pos = 2; 341 ctx->pos = 2;
273 off = 2;
274 } 342 }
275 343
276 /* can we use the dcache? */ 344 /* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
285 err = __dcache_readdir(file, ctx, shared_gen); 353 err = __dcache_readdir(file, ctx, shared_gen);
286 if (err != -EAGAIN) 354 if (err != -EAGAIN)
287 return err; 355 return err;
288 frag = fpos_frag(ctx->pos);
289 off = fpos_off(ctx->pos);
290 } else { 356 } else {
291 spin_unlock(&ci->i_ceph_lock); 357 spin_unlock(&ci->i_ceph_lock);
292 } 358 }
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
294 /* proceed with a normal readdir */ 360 /* proceed with a normal readdir */
295more: 361more:
296 /* do we have the correct frag content buffered? */ 362 /* do we have the correct frag content buffered? */
297 if (fi->frag != frag || fi->last_readdir == NULL) { 363 if (need_send_readdir(fi, ctx->pos)) {
298 struct ceph_mds_request *req; 364 struct ceph_mds_request *req;
365 unsigned frag;
299 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 366 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
300 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 367 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
301 368
@@ -305,6 +372,13 @@ more:
305 fi->last_readdir = NULL; 372 fi->last_readdir = NULL;
306 } 373 }
307 374
375 if (is_hash_order(ctx->pos)) {
376 frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
377 NULL, NULL);
378 } else {
379 frag = fpos_frag(ctx->pos);
380 }
381
308 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 382 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
309 ceph_vinop(inode), frag, fi->last_name); 383 ceph_vinop(inode), frag, fi->last_name);
310 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 384 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
331 req->r_readdir_cache_idx = fi->readdir_cache_idx; 405 req->r_readdir_cache_idx = fi->readdir_cache_idx;
332 req->r_readdir_offset = fi->next_offset; 406 req->r_readdir_offset = fi->next_offset;
333 req->r_args.readdir.frag = cpu_to_le32(frag); 407 req->r_args.readdir.frag = cpu_to_le32(frag);
408 req->r_args.readdir.flags =
409 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
334 410
335 req->r_inode = inode; 411 req->r_inode = inode;
336 ihold(inode); 412 ihold(inode);
@@ -340,22 +416,26 @@ more:
340 ceph_mdsc_put_request(req); 416 ceph_mdsc_put_request(req);
341 return err; 417 return err;
342 } 418 }
343 dout("readdir got and parsed readdir result=%d" 419 dout("readdir got and parsed readdir result=%d on "
344 " on frag %x, end=%d, complete=%d\n", err, frag, 420 "frag %x, end=%d, complete=%d, hash_order=%d\n",
421 err, frag,
345 (int)req->r_reply_info.dir_end, 422 (int)req->r_reply_info.dir_end,
346 (int)req->r_reply_info.dir_complete); 423 (int)req->r_reply_info.dir_complete,
347 424 (int)req->r_reply_info.hash_order);
348 425
349 /* note next offset and last dentry name */
350 rinfo = &req->r_reply_info; 426 rinfo = &req->r_reply_info;
351 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 427 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
352 frag = le32_to_cpu(rinfo->dir_dir->frag); 428 frag = le32_to_cpu(rinfo->dir_dir->frag);
353 off = req->r_readdir_offset; 429 if (!rinfo->hash_order) {
354 fi->next_offset = off; 430 fi->next_offset = req->r_readdir_offset;
431 /* adjust ctx->pos to beginning of frag */
432 ctx->pos = ceph_make_fpos(frag,
433 fi->next_offset,
434 false);
435 }
355 } 436 }
356 437
357 fi->frag = frag; 438 fi->frag = frag;
358 fi->offset = fi->next_offset;
359 fi->last_readdir = req; 439 fi->last_readdir = req;
360 440
361 if (req->r_did_prepopulate) { 441 if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
363 if (fi->readdir_cache_idx < 0) { 443 if (fi->readdir_cache_idx < 0) {
364 /* preclude from marking dir ordered */ 444 /* preclude from marking dir ordered */
365 fi->dir_ordered_count = 0; 445 fi->dir_ordered_count = 0;
366 } else if (ceph_frag_is_leftmost(frag) && off == 2) { 446 } else if (ceph_frag_is_leftmost(frag) &&
447 fi->next_offset == 2) {
367 /* note dir version at start of readdir so 448 /* note dir version at start of readdir so
368 * we can tell if any dentries get dropped */ 449 * we can tell if any dentries get dropped */
369 fi->dir_release_count = req->r_dir_release_cnt; 450 fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
377 fi->dir_release_count = 0; 458 fi->dir_release_count = 0;
378 } 459 }
379 460
380 if (req->r_reply_info.dir_end) { 461 /* note next offset and last dentry name */
381 kfree(fi->last_name); 462 if (rinfo->dir_nr > 0) {
382 fi->last_name = NULL; 463 struct ceph_mds_reply_dir_entry *rde =
383 if (ceph_frag_is_rightmost(frag)) 464 rinfo->dir_entries + (rinfo->dir_nr-1);
384 fi->next_offset = 2; 465 unsigned next_offset = req->r_reply_info.dir_end ?
385 else 466 2 : (fpos_off(rde->offset) + 1);
386 fi->next_offset = 0; 467 err = note_last_dentry(fi, rde->name, rde->name_len,
387 } else { 468 next_offset);
388 err = note_last_dentry(fi,
389 rinfo->dir_dname[rinfo->dir_nr-1],
390 rinfo->dir_dname_len[rinfo->dir_nr-1],
391 fi->next_offset + rinfo->dir_nr);
392 if (err) 469 if (err)
393 return err; 470 return err;
471 } else if (req->r_reply_info.dir_end) {
472 fi->next_offset = 2;
473 /* keep last name */
394 } 474 }
395 } 475 }
396 476
397 rinfo = &fi->last_readdir->r_reply_info; 477 rinfo = &fi->last_readdir->r_reply_info;
398 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 478 dout("readdir frag %x num %d pos %llx chunk first %llx\n",
399 rinfo->dir_nr, off, fi->offset); 479 fi->frag, rinfo->dir_nr, ctx->pos,
400 480 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
401 ctx->pos = ceph_make_fpos(frag, off); 481
402 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 482 i = 0;
403 struct ceph_mds_reply_inode *in = 483 /* search start position */
404 rinfo->dir_in[off - fi->offset].in; 484 if (rinfo->dir_nr > 0) {
485 int step, nr = rinfo->dir_nr;
486 while (nr > 0) {
487 step = nr >> 1;
488 if (rinfo->dir_entries[i + step].offset < ctx->pos) {
489 i += step + 1;
490 nr -= step + 1;
491 } else {
492 nr = step;
493 }
494 }
495 }
496 for (; i < rinfo->dir_nr; i++) {
497 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
405 struct ceph_vino vino; 498 struct ceph_vino vino;
406 ino_t ino; 499 ino_t ino;
407 500
408 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 501 BUG_ON(rde->offset < ctx->pos);
409 off, off - fi->offset, rinfo->dir_nr, ctx->pos, 502
410 rinfo->dir_dname_len[off - fi->offset], 503 ctx->pos = rde->offset;
411 rinfo->dir_dname[off - fi->offset], in); 504 dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
412 BUG_ON(!in); 505 i, rinfo->dir_nr, ctx->pos,
413 ftype = le32_to_cpu(in->mode) >> 12; 506 rde->name_len, rde->name, &rde->inode.in);
414 vino.ino = le64_to_cpu(in->ino); 507
415 vino.snap = le64_to_cpu(in->snapid); 508 BUG_ON(!rde->inode.in);
509 ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
510 vino.ino = le64_to_cpu(rde->inode.in->ino);
511 vino.snap = le64_to_cpu(rde->inode.in->snapid);
416 ino = ceph_vino_to_ino(vino); 512 ino = ceph_vino_to_ino(vino);
417 if (!dir_emit(ctx, 513
418 rinfo->dir_dname[off - fi->offset], 514 if (!dir_emit(ctx, rde->name, rde->name_len,
419 rinfo->dir_dname_len[off - fi->offset], 515 ceph_translate_ino(inode->i_sb, ino), ftype)) {
420 ceph_translate_ino(inode->i_sb, ino), ftype)) {
421 dout("filldir stopping us...\n"); 516 dout("filldir stopping us...\n");
422 return 0; 517 return 0;
423 } 518 }
424 off++;
425 ctx->pos++; 519 ctx->pos++;
426 } 520 }
427 521
428 if (fi->last_name) { 522 if (fi->next_offset > 2) {
429 ceph_mdsc_put_request(fi->last_readdir); 523 ceph_mdsc_put_request(fi->last_readdir);
430 fi->last_readdir = NULL; 524 fi->last_readdir = NULL;
431 goto more; 525 goto more;
432 } 526 }
433 527
434 /* more frags? */ 528 /* more frags? */
435 if (!ceph_frag_is_rightmost(frag)) { 529 if (!ceph_frag_is_rightmost(fi->frag)) {
436 frag = ceph_frag_next(frag); 530 unsigned frag = ceph_frag_next(fi->frag);
437 off = 0; 531 if (is_hash_order(ctx->pos)) {
438 ctx->pos = ceph_make_fpos(frag, off); 532 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
533 fi->next_offset, true);
534 if (new_pos > ctx->pos)
535 ctx->pos = new_pos;
536 /* keep last_name */
537 } else {
538 ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
539 kfree(fi->last_name);
540 fi->last_name = NULL;
541 }
439 dout("readdir next frag is %x\n", frag); 542 dout("readdir next frag is %x\n", frag);
440 goto more; 543 goto more;
441 } 544 }
@@ -467,7 +570,7 @@ more:
467 return 0; 570 return 0;
468} 571}
469 572
470static void reset_readdir(struct ceph_file_info *fi, unsigned frag) 573static void reset_readdir(struct ceph_file_info *fi)
471{ 574{
472 if (fi->last_readdir) { 575 if (fi->last_readdir) {
473 ceph_mdsc_put_request(fi->last_readdir); 576 ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
477 fi->last_name = NULL; 580 fi->last_name = NULL;
478 fi->dir_release_count = 0; 581 fi->dir_release_count = 0;
479 fi->readdir_cache_idx = -1; 582 fi->readdir_cache_idx = -1;
480 if (ceph_frag_is_leftmost(frag)) 583 fi->next_offset = 2; /* compensate for . and .. */
481 fi->next_offset = 2; /* compensate for . and .. */
482 else
483 fi->next_offset = 0;
484 fi->flags &= ~CEPH_F_ATEND; 584 fi->flags &= ~CEPH_F_ATEND;
485} 585}
486 586
587/*
 588 * discard buffered readdir content on seekdir(0), on a seek to a new
 589 * frag, or on a seek prior to the current chunk
590 */
591static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
592{
593 struct ceph_mds_reply_info_parsed *rinfo;
594 loff_t chunk_offset;
595 if (new_pos == 0)
596 return true;
597 if (is_hash_order(new_pos)) {
598 /* no need to reset last_name for a forward seek when
 599 * dentries are sorted in hash order */
 600 } else if (fi->frag != fpos_frag(new_pos)) {
601 return true;
602 }
603 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
604 if (!rinfo || !rinfo->dir_nr)
605 return true;
606 chunk_offset = rinfo->dir_entries[0].offset;
607 return new_pos < chunk_offset ||
608 is_hash_order(new_pos) != is_hash_order(chunk_offset);
609}
610
487static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) 611static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
488{ 612{
489 struct ceph_file_info *fi = file->private_data; 613 struct ceph_file_info *fi = file->private_data;
490 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
491 loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
492 loff_t retval; 615 loff_t retval;
493 616
494 inode_lock(inode); 617 inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
505 } 628 }
506 629
507 if (offset >= 0) { 630 if (offset >= 0) {
631 if (need_reset_readdir(fi, offset)) {
632 dout("dir_llseek dropping %p content\n", file);
633 reset_readdir(fi);
634 } else if (is_hash_order(offset) && offset > file->f_pos) {
635 /* for hash offset, we don't know if a forward seek
636 * is within same frag */
637 fi->dir_release_count = 0;
638 fi->readdir_cache_idx = -1;
639 }
640
508 if (offset != file->f_pos) { 641 if (offset != file->f_pos) {
509 file->f_pos = offset; 642 file->f_pos = offset;
510 file->f_version = 0; 643 file->f_version = 0;
511 fi->flags &= ~CEPH_F_ATEND; 644 fi->flags &= ~CEPH_F_ATEND;
512 } 645 }
513 retval = offset; 646 retval = offset;
514
515 if (offset == 0 ||
516 fpos_frag(offset) != fi->frag ||
517 fpos_off(offset) < fi->offset) {
518 /* discard buffered readdir content on seekdir(0), or
519 * seek to new frag, or seek prior to current chunk */
520 dout("dir_llseek dropping %p content\n", file);
521 reset_readdir(fi, fpos_frag(offset));
522 } else if (fpos_cmp(offset, old_offset) > 0) {
523 /* reset dir_release_count if we did a forward seek */
524 fi->dir_release_count = 0;
525 fi->readdir_cache_idx = -1;
526 }
527 } 647 }
528out: 648out:
529 inode_unlock(inode); 649 inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
591 return dentry; 711 return dentry;
592} 712}
593 713
594static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 714static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
595{ 715{
596 return ceph_ino(inode) == CEPH_INO_ROOT && 716 return ceph_ino(inode) == CEPH_INO_ROOT &&
597 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 717 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4f1dc7120916..a888df6f2d71 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
192} 192}
193 193
194/* 194/*
 195 * try to renew caps after the MDS session gets killed.
196 */
197int ceph_renew_caps(struct inode *inode)
198{
199 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_mds_request *req;
202 int err, flags, wanted;
203
204 spin_lock(&ci->i_ceph_lock);
205 wanted = __ceph_caps_file_wanted(ci);
206 if (__ceph_is_any_real_caps(ci) &&
 207 (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
208 int issued = __ceph_caps_issued(ci, NULL);
209 spin_unlock(&ci->i_ceph_lock);
210 dout("renew caps %p want %s issued %s updating mds_wanted\n",
211 inode, ceph_cap_string(wanted), ceph_cap_string(issued));
212 ceph_check_caps(ci, 0, NULL);
213 return 0;
214 }
215 spin_unlock(&ci->i_ceph_lock);
216
217 flags = 0;
218 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
219 flags = O_RDWR;
220 else if (wanted & CEPH_CAP_FILE_RD)
221 flags = O_RDONLY;
222 else if (wanted & CEPH_CAP_FILE_WR)
223 flags = O_WRONLY;
224#ifdef O_LAZY
225 if (wanted & CEPH_CAP_FILE_LAZYIO)
226 flags |= O_LAZY;
227#endif
228
229 req = prepare_open_request(inode->i_sb, flags, 0);
230 if (IS_ERR(req)) {
231 err = PTR_ERR(req);
232 goto out;
233 }
234
235 req->r_inode = inode;
236 ihold(inode);
237 req->r_num_caps = 1;
238 req->r_fmode = -1;
239
240 err = ceph_mdsc_do_request(mdsc, NULL, req);
241 ceph_mdsc_put_request(req);
242out:
243 dout("renew caps %p open result=%d\n", inode, err);
244 return err < 0 ? err : 0;
245}
246
247/*
195 * If we already have the requisite capabilities, we can satisfy 248 * If we already have the requisite capabilities, we can satisfy
196 * the open request locally (no need to request new caps from the 249 * the open request locally (no need to request new caps from the
197 * MDS). We do, however, need to inform the MDS (asynchronously) 250 * MDS). We do, however, need to inform the MDS (asynchronously)
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
616 kfree(aio_req); 669 kfree(aio_req);
617} 670}
618 671
619static void ceph_aio_complete_req(struct ceph_osd_request *req, 672static void ceph_aio_complete_req(struct ceph_osd_request *req)
620 struct ceph_msg *msg)
621{ 673{
622 int rc = req->r_result; 674 int rc = req->r_result;
623 struct inode *inode = req->r_inode; 675 struct inode *inode = req->r_inode;
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
714 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | 766 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
715 CEPH_OSD_FLAG_ONDISK | 767 CEPH_OSD_FLAG_ONDISK |
716 CEPH_OSD_FLAG_WRITE; 768 CEPH_OSD_FLAG_WRITE;
717 req->r_base_oloc = orig_req->r_base_oloc; 769 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
718 req->r_base_oid = orig_req->r_base_oid; 770 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
771
772 ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
773 if (ret) {
774 ceph_osdc_put_request(req);
775 req = orig_req;
776 goto out;
777 }
719 778
720 req->r_ops[0] = orig_req->r_ops[0]; 779 req->r_ops[0] = orig_req->r_ops[0];
721 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 780 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
722 781
723 ceph_osdc_build_request(req, req->r_ops[0].extent.offset, 782 req->r_mtime = aio_req->mtime;
724 snapc, CEPH_NOSNAP, &aio_req->mtime); 783 req->r_data_offset = req->r_ops[0].extent.offset;
725 784
726 ceph_osdc_put_request(orig_req); 785 ceph_osdc_put_request(orig_req);
727 786
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
733out: 792out:
734 if (ret < 0) { 793 if (ret < 0) {
735 req->r_result = ret; 794 req->r_result = ret;
736 ceph_aio_complete_req(req, NULL); 795 ceph_aio_complete_req(req);
737 } 796 }
738 797
739 ceph_put_snap_context(snapc); 798 ceph_put_snap_context(snapc);
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
764 list_add_tail(&req->r_unsafe_item, 823 list_add_tail(&req->r_unsafe_item,
765 &ci->i_unsafe_writes); 824 &ci->i_unsafe_writes);
766 spin_unlock(&ci->i_unsafe_lock); 825 spin_unlock(&ci->i_unsafe_lock);
826
827 complete_all(&req->r_completion);
767 } else { 828 } else {
768 spin_lock(&ci->i_unsafe_lock); 829 spin_lock(&ci->i_unsafe_lock);
769 list_del_init(&req->r_unsafe_item); 830 list_del_init(&req->r_unsafe_item);
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
875 (pos+len) | (PAGE_SIZE - 1)); 936 (pos+len) | (PAGE_SIZE - 1));
876 937
877 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 938 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
939 req->r_mtime = mtime;
878 } 940 }
879 941
880
881 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, 942 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
882 false, false); 943 false, false);
883 944
884 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
885
886 if (aio_req) { 945 if (aio_req) {
887 aio_req->total_len += len; 946 aio_req->total_len += len;
888 aio_req->num_reqs++; 947 aio_req->num_reqs++;
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
956 req, false); 1015 req, false);
957 if (ret < 0) { 1016 if (ret < 0) {
958 req->r_result = ret; 1017 req->r_result = ret;
959 ceph_aio_complete_req(req, NULL); 1018 ceph_aio_complete_req(req);
960 } 1019 }
961 } 1020 }
962 return -EIOCBQUEUED; 1021 return -EIOCBQUEUED;
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
1067 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 1126 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
1068 false, true); 1127 false, true);
1069 1128
1070 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 1129 req->r_mtime = mtime;
1071 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
1072
1073 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1130 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1074 if (!ret) 1131 if (!ret)
1075 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1132 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
1524 goto out; 1581 goto out;
1525 } 1582 }
1526 1583
1527 ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, 1584 req->r_mtime = inode->i_mtime;
1528 &inode->i_mtime);
1529
1530 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1585 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1531 if (!ret) { 1586 if (!ret) {
1532 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1587 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e669cfa9d793..f059b5997072 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
11#include <linux/xattr.h> 11#include <linux/xattr.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/sort.h>
14 15
15#include "super.h" 16#include "super.h"
16#include "mds_client.h" 17#include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
254 diri_auth = ci->i_auth_cap->mds; 255 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock); 256 spin_unlock(&ci->i_ceph_lock);
256 257
258 if (mds == -1) /* CDIR_AUTH_PARENT */
259 mds = diri_auth;
260
257 mutex_lock(&ci->i_fragtree_mutex); 261 mutex_lock(&ci->i_fragtree_mutex);
258 if (ndist == 0 && mds == diri_auth) { 262 if (ndist == 0 && mds == diri_auth) {
259 /* no delegation info needed. */ 263 /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
300 return err; 304 return err;
301} 305}
302 306
307static int frag_tree_split_cmp(const void *l, const void *r)
308{
309 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
310 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
311 return ceph_frag_compare(ls->frag, rs->frag);
312}
313
314static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
315{
316 if (!frag)
317 return f == ceph_frag_make(0, 0);
318 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
319 return false;
320 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
321}
322
303static int ceph_fill_fragtree(struct inode *inode, 323static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree, 324 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo) 325 struct ceph_mds_reply_dirfrag *dirinfo)
306{ 326{
307 struct ceph_inode_info *ci = ceph_inode(inode); 327 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag; 328 struct ceph_inode_frag *frag, *prev_frag = NULL;
309 struct rb_node *rb_node; 329 struct rb_node *rb_node;
310 int i; 330 unsigned i, split_by, nsplits;
311 u32 id, nsplits; 331 u32 id;
312 bool update = false; 332 bool update = false;
313 333
314 mutex_lock(&ci->i_fragtree_mutex); 334 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits); 335 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) { 336 if (nsplits != ci->i_fragtree_nsplits) {
337 update = true;
338 } else if (nsplits) {
317 i = prandom_u32() % nsplits; 339 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag); 340 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id)) 341 if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
332 if (!update) 354 if (!update)
333 goto out_unlock; 355 goto out_unlock;
334 356
357 if (nsplits > 1) {
358 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
359 frag_tree_split_cmp, NULL);
360 }
361
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 362 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree); 363 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) { 364 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag); 365 id = le32_to_cpu(fragtree->splits[i].frag);
366 split_by = le32_to_cpu(fragtree->splits[i].by);
367 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
368 pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
369 "frag %x split by %d\n", ceph_vinop(inode),
370 i, nsplits, id, split_by);
371 continue;
372 }
339 frag = NULL; 373 frag = NULL;
340 while (rb_node) { 374 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 375 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
347 break; 381 break;
348 } 382 }
349 rb_node = rb_next(rb_node); 383 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree); 384 /* delete stale split/leaf node */
351 kfree(frag); 385 if (frag->split_by > 0 ||
386 !is_frag_child(frag->frag, prev_frag)) {
387 rb_erase(&frag->node, &ci->i_fragtree);
388 if (frag->split_by > 0)
389 ci->i_fragtree_nsplits--;
390 kfree(frag);
391 }
352 frag = NULL; 392 frag = NULL;
353 } 393 }
354 if (!frag) { 394 if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
356 if (IS_ERR(frag)) 396 if (IS_ERR(frag))
357 continue; 397 continue;
358 } 398 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by); 399 if (frag->split_by == 0)
400 ci->i_fragtree_nsplits++;
401 frag->split_by = split_by;
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 402 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
403 prev_frag = frag;
361 } 404 }
362 while (rb_node) { 405 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 406 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node); 407 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree); 408 /* delete stale split/leaf node */
366 kfree(frag); 409 if (frag->split_by > 0 ||
410 !is_frag_child(frag->frag, prev_frag)) {
411 rb_erase(&frag->node, &ci->i_fragtree);
412 if (frag->split_by > 0)
413 ci->i_fragtree_nsplits--;
414 kfree(frag);
415 }
367 } 416 }
368out_unlock: 417out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex); 418 mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
513 rb_erase(n, &ci->i_fragtree); 562 rb_erase(n, &ci->i_fragtree);
514 kfree(frag); 563 kfree(frag);
515 } 564 }
565 ci->i_fragtree_nsplits = 0;
516 566
517 __ceph_destroy_xattrs(ci); 567 __ceph_destroy_xattrs(ci);
518 if (ci->i_xattrs.blob) 568 if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
533 return 1; 583 return 1;
534} 584}
535 585
586static inline blkcnt_t calc_inode_blocks(u64 size)
587{
588 return (size + (1<<9) - 1) >> 9;
589}
590
536/* 591/*
537 * Helpers to fill in size, ctime, mtime, and atime. We have to be 592 * Helpers to fill in size, ctime, mtime, and atime. We have to be
538 * careful because either the client or MDS may have more up to date 593 * careful because either the client or MDS may have more up to date
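i_blocks counts 512-byte sectors, so the new helper simply rounds the byte size up: calc_inode_blocks(4096) = (4096 + 511) >> 9 = 8 sectors, and even a 1-byte file accounts for one sector.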
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
555 size = 0; 610 size = 0;
556 } 611 }
557 i_size_write(inode, size); 612 i_size_write(inode, size);
558 inode->i_blocks = (size + (1<<9) - 1) >> 9; 613 inode->i_blocks = calc_inode_blocks(size);
559 ci->i_reported_size = size; 614 ci->i_reported_size = size;
560 if (truncate_seq != ci->i_truncate_seq) { 615 if (truncate_seq != ci->i_truncate_seq) {
561 dout("truncate_seq %u -> %u\n", 616 dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
814 869
815 spin_unlock(&ci->i_ceph_lock); 870 spin_unlock(&ci->i_ceph_lock);
816 871
817 err = -EINVAL; 872 if (symlen != i_size_read(inode)) {
818 if (WARN_ON(symlen != i_size_read(inode))) 873 pr_err("fill_inode %llx.%llx BAD symlink "
819 goto out; 874 "size %lld\n", ceph_vinop(inode),
875 i_size_read(inode));
876 i_size_write(inode, symlen);
877 inode->i_blocks = calc_inode_blocks(symlen);
878 }
820 879
821 err = -ENOMEM; 880 err = -ENOMEM;
822 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); 881 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1309 int i, err = 0; 1368 int i, err = 0;
1310 1369
1311 for (i = 0; i < rinfo->dir_nr; i++) { 1370 for (i = 0; i < rinfo->dir_nr; i++) {
1371 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1312 struct ceph_vino vino; 1372 struct ceph_vino vino;
1313 struct inode *in; 1373 struct inode *in;
1314 int rc; 1374 int rc;
1315 1375
1316 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1376 vino.ino = le64_to_cpu(rde->inode.in->ino);
1317 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1377 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1318 1378
1319 in = ceph_get_inode(req->r_dentry->d_sb, vino); 1379 in = ceph_get_inode(req->r_dentry->d_sb, vino);
1320 if (IS_ERR(in)) { 1380 if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1322 dout("new_inode badness got %d\n", err); 1382 dout("new_inode badness got %d\n", err);
1323 continue; 1383 continue;
1324 } 1384 }
1325 rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1385 rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1326 req->r_request_started, -1, 1386 req->r_request_started, -1,
1327 &req->r_caps_reservation); 1387 &req->r_caps_reservation);
1328 if (rc < 0) { 1388 if (rc < 0) {
1329 pr_err("fill_inode badness on %p got %d\n", in, rc); 1389 pr_err("fill_inode badness on %p got %d\n", in, rc);
1330 err = rc; 1390 err = rc;
1331 continue;
1332 } 1391 }
1392 iput(in);
1333 } 1393 }
1334 1394
1335 return err; 1395 return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1387 struct ceph_mds_session *session) 1447 struct ceph_mds_session *session)
1388{ 1448{
1389 struct dentry *parent = req->r_dentry; 1449 struct dentry *parent = req->r_dentry;
1450 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1390 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1451 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1391 struct qstr dname; 1452 struct qstr dname;
1392 struct dentry *dn; 1453 struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1394 int err = 0, skipped = 0, ret, i; 1455 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1456 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1457 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di;
1398 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1458 u32 frag = le32_to_cpu(rhead->args.readdir.frag);
1459 u32 last_hash = 0;
1460 u32 fpos_offset;
1399 struct ceph_readdir_cache_control cache_ctl = {}; 1461 struct ceph_readdir_cache_control cache_ctl = {};
1400 1462
1401 if (req->r_aborted) 1463 if (req->r_aborted)
1402 return readdir_prepopulate_inodes_only(req, session); 1464 return readdir_prepopulate_inodes_only(req, session);
1403 1465
1466 if (rinfo->hash_order && req->r_path2) {
1467 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1468 req->r_path2, strlen(req->r_path2));
1469 last_hash = ceph_frag_value(last_hash);
1470 }
1471
1404 if (rinfo->dir_dir && 1472 if (rinfo->dir_dir &&
1405 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1473 le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1406 dout("readdir_prepopulate got new frag %x -> %x\n", 1474 dout("readdir_prepopulate got new frag %x -> %x\n",
1407 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1475 frag, le32_to_cpu(rinfo->dir_dir->frag));
1408 frag = le32_to_cpu(rinfo->dir_dir->frag); 1476 frag = le32_to_cpu(rinfo->dir_dir->frag);
1409 if (ceph_frag_is_leftmost(frag)) 1477 if (!rinfo->hash_order)
1410 req->r_readdir_offset = 2; 1478 req->r_readdir_offset = 2;
1411 else
1412 req->r_readdir_offset = 0;
1413 } 1479 }
1414 1480
1415 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1481 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1427 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { 1493 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
1428 /* note dir version at start of readdir so we can tell 1494 /* note dir version at start of readdir so we can tell
1429 * if any dentries get dropped */ 1495 * if any dentries get dropped */
1430 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1431 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1496 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
1432 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1497 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
1433 req->r_readdir_cache_idx = 0; 1498 req->r_readdir_cache_idx = 0;
1434 } 1499 }
1435 1500
1436 cache_ctl.index = req->r_readdir_cache_idx; 1501 cache_ctl.index = req->r_readdir_cache_idx;
1502 fpos_offset = req->r_readdir_offset;
1437 1503
1438 /* FIXME: release caps/leases if error occurs */ 1504 /* FIXME: release caps/leases if error occurs */
1439 for (i = 0; i < rinfo->dir_nr; i++) { 1505 for (i = 0; i < rinfo->dir_nr; i++) {
1506 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1440 struct ceph_vino vino; 1507 struct ceph_vino vino;
1441 1508
1442 dname.name = rinfo->dir_dname[i]; 1509 dname.name = rde->name;
1443 dname.len = rinfo->dir_dname_len[i]; 1510 dname.len = rde->name_len;
1444 dname.hash = full_name_hash(dname.name, dname.len); 1511 dname.hash = full_name_hash(dname.name, dname.len);
1445 1512
1446 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1513 vino.ino = le64_to_cpu(rde->inode.in->ino);
1447 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1514 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1515
1516 if (rinfo->hash_order) {
1517 u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1518 rde->name, rde->name_len);
1519 hash = ceph_frag_value(hash);
1520 if (hash != last_hash)
1521 fpos_offset = 2;
1522 last_hash = hash;
1523 rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
1524 } else {
1525 rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
1526 }
1448 1527
1449retry_lookup: 1528retry_lookup:
1450 dn = d_lookup(parent, &dname); 1529 dn = d_lookup(parent, &dname);
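With hash_order set, the name's 24-bit hash supplies the high part of each offset and fpos_offset restarts at 2 whenever the hash changes, so colliding names occupy consecutive low slots. For hypothetical hashes H("a") = H("b") = 0x1234 and H("c") = 0x5678, the three entries get (0xff << 52) | (0x1234 << 28) | 2, the same with | 3, and (0xff << 52) | (0x5678 << 28) | 2.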
@@ -1490,7 +1569,7 @@ retry_lookup:
1490 } 1569 }
1491 } 1570 }
1492 1571
1493 ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1572 ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1494 req->r_request_started, -1, 1573 req->r_request_started, -1,
1495 &req->r_caps_reservation); 1574 &req->r_caps_reservation);
1496 if (ret < 0) { 1575 if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
1523 dn = realdn; 1602 dn = realdn;
1524 } 1603 }
1525 1604
1526 di = dn->d_fsdata; 1605 ceph_dentry(dn)->offset = rde->offset;
1527 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1528 1606
1529 update_dentry_lease(dn, rinfo->dir_dlease[i], 1607 update_dentry_lease(dn, rde->lease, req->r_session,
1530 req->r_session,
1531 req->r_request_started); 1608 req->r_request_started);
1532 1609
1533 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1610 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1562 spin_lock(&ci->i_ceph_lock); 1639 spin_lock(&ci->i_ceph_lock);
1563 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1640 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1564 i_size_write(inode, size); 1641 i_size_write(inode, size);
1565 inode->i_blocks = (size + (1 << 9) - 1) >> 9; 1642 inode->i_blocks = calc_inode_blocks(size);
1566 1643
1567 /* tell the MDS if we are approaching max_size */ 1644 /* tell the MDS if we are approaching max_size */
1568 if ((size << 1) >= ci->i_max_size && 1645 if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
1624 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 1701 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1625 i_pg_inv_work); 1702 i_pg_inv_work);
1626 struct inode *inode = &ci->vfs_inode; 1703 struct inode *inode = &ci->vfs_inode;
1704 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1627 u32 orig_gen; 1705 u32 orig_gen;
1628 int check = 0; 1706 int check = 0;
1629 1707
1630 mutex_lock(&ci->i_truncate_mutex); 1708 mutex_lock(&ci->i_truncate_mutex);
1709
1710 if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1711 pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
1712 inode, ceph_ino(inode));
1713 mapping_set_error(inode->i_mapping, -EIO);
1714 truncate_pagecache(inode, 0);
1715 mutex_unlock(&ci->i_truncate_mutex);
1716 goto out;
1717 }
1718
1631 spin_lock(&ci->i_ceph_lock); 1719 spin_lock(&ci->i_ceph_lock);
1632 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1720 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1633 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1721 ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
1641 orig_gen = ci->i_rdcache_gen; 1729 orig_gen = ci->i_rdcache_gen;
1642 spin_unlock(&ci->i_ceph_lock); 1730 spin_unlock(&ci->i_ceph_lock);
1643 1731
1644 truncate_pagecache(inode, 0); 1732 if (invalidate_inode_pages2(inode->i_mapping) < 0) {
1733 pr_err("invalidate_pages %p fails\n", inode);
1734 }
1645 1735
1646 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1647 if (orig_gen == ci->i_rdcache_gen && 1737 if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
1920 if ((issued & CEPH_CAP_FILE_EXCL) && 2010 if ((issued & CEPH_CAP_FILE_EXCL) &&
1921 attr->ia_size > inode->i_size) { 2011 attr->ia_size > inode->i_size) {
1922 i_size_write(inode, attr->ia_size); 2012 i_size_write(inode, attr->ia_size);
1923 inode->i_blocks = 2013 inode->i_blocks = calc_inode_blocks(attr->ia_size);
1924 (attr->ia_size + (1 << 9) - 1) >> 9;
1925 inode->i_ctime = attr->ia_ctime; 2014 inode->i_ctime = attr->ia_ctime;
1926 ci->i_reported_size = attr->ia_size; 2015 ci->i_reported_size = attr->ia_size;
1927 dirtied |= CEPH_CAP_FILE_EXCL; 2016 dirtied |= CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
193 if (copy_from_user(&dl, arg, sizeof(dl))) 193 if (copy_from_user(&dl, arg, sizeof(dl)))
194 return -EFAULT; 194 return -EFAULT;
195 195
196 down_read(&osdc->map_sem); 196 down_read(&osdc->lock);
197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
198 &dl.object_no, &dl.object_offset, 198 &dl.object_no, &dl.object_offset,
199 &olen); 199 &olen);
200 if (r < 0) { 200 if (r < 0) {
201 up_read(&osdc->map_sem); 201 up_read(&osdc->lock);
202 return -EIO; 202 return -EIO;
203 } 203 }
204 dl.file_offset -= dl.object_offset; 204 dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
213 ceph_ino(inode), dl.object_no); 213 ceph_ino(inode), dl.object_no);
214 214
215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); 215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
216 ceph_oid_set_name(&oid, dl.object_name); 216 ceph_oid_printf(&oid, "%s", dl.object_name);
217 217
218 r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); 218 r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
219 if (r < 0) { 219 if (r < 0) {
220 up_read(&osdc->map_sem); 220 up_read(&osdc->lock);
221 return r; 221 return r;
222 } 222 }
223 223
224 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 224 dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
225 if (dl.osd >= 0) { 225 if (dl.osd >= 0) {
226 struct ceph_entity_addr *a = 226 struct ceph_entity_addr *a =
227 ceph_osd_addr(osdc->osdmap, dl.osd); 227 ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
230 } else { 230 } else {
231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); 231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
232 } 232 }
233 up_read(&osdc->map_sem); 233 up_read(&osdc->lock);
234 234
235 /* send result back to user */ 235 /* send result back to user */
236 if (copy_to_user(arg, &dl, sizeof(dl))) 236 if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85b8517f17a0..2103b823bec0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
181 181
182 ceph_decode_need(p, end, sizeof(num) + 2, bad); 182 ceph_decode_need(p, end, sizeof(num) + 2, bad);
183 num = ceph_decode_32(p); 183 num = ceph_decode_32(p);
184 info->dir_end = ceph_decode_8(p); 184 {
185 info->dir_complete = ceph_decode_8(p); 185 u16 flags = ceph_decode_16(p);
186 info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
187 info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
188 info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
189 }
186 if (num == 0) 190 if (num == 0)
187 goto done; 191 goto done;
188 192
189 BUG_ON(!info->dir_in); 193 BUG_ON(!info->dir_entries);
190 info->dir_dname = (void *)(info->dir_in + num); 194 if ((unsigned long)(info->dir_entries + num) >
191 info->dir_dname_len = (void *)(info->dir_dname + num); 195 (unsigned long)info->dir_entries + info->dir_buf_size) {
192 info->dir_dlease = (void *)(info->dir_dname_len + num);
193 if ((unsigned long)(info->dir_dlease + num) >
194 (unsigned long)info->dir_in + info->dir_buf_size) {
195 pr_err("dir contents are larger than expected\n"); 196 pr_err("dir contents are larger than expected\n");
196 WARN_ON(1); 197 WARN_ON(1);
197 goto bad; 198 goto bad;
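The two old u8 fields are re-read as one little-endian u16 bitfield; this stays wire-compatible because the end and complete bits land on the same byte positions the old fields occupied, while freeing bits for new flags such as hash order. A sketch of the decode, with the bit values assumed to match what this series defines in include/linux/ceph/ceph_fs.h:

    /* assumed bit layout: end in byte 0, complete/hash_order in byte 1 */
    #define CEPH_READDIR_FRAG_END      (1 << 0)
    #define CEPH_READDIR_FRAG_COMPLETE (1 << 8)
    #define CEPH_READDIR_HASH_ORDER    (1 << 9)

    struct dir_flags { int end, complete, hash_order; };

    /* flags: host-endian value of the le16 read off the wire */
    static struct dir_flags decode_dir_flags(unsigned int flags)
    {
            struct dir_flags f = {
                    .end        = !!(flags & CEPH_READDIR_FRAG_END),
                    .complete   = !!(flags & CEPH_READDIR_FRAG_COMPLETE),
                    .hash_order = !!(flags & CEPH_READDIR_HASH_ORDER),
            };
            return f;
    }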
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
199 200
200 info->dir_nr = num; 201 info->dir_nr = num;
201 while (num) { 202 while (num) {
203 struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
202 /* dentry */ 204 /* dentry */
203 ceph_decode_need(p, end, sizeof(u32)*2, bad); 205 ceph_decode_need(p, end, sizeof(u32)*2, bad);
204 info->dir_dname_len[i] = ceph_decode_32(p); 206 rde->name_len = ceph_decode_32(p);
205 ceph_decode_need(p, end, info->dir_dname_len[i], bad); 207 ceph_decode_need(p, end, rde->name_len, bad);
206 info->dir_dname[i] = *p; 208 rde->name = *p;
207 *p += info->dir_dname_len[i]; 209 *p += rde->name_len;
208 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], 210 dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
209 info->dir_dname[i]); 211 rde->lease = *p;
210 info->dir_dlease[i] = *p;
211 *p += sizeof(struct ceph_mds_reply_lease); 212 *p += sizeof(struct ceph_mds_reply_lease);
212 213
213 /* inode */ 214 /* inode */
214 err = parse_reply_info_in(p, end, &info->dir_in[i], features); 215 err = parse_reply_info_in(p, end, &rde->inode, features);
215 if (err < 0) 216 if (err < 0)
216 goto out_bad; 217 goto out_bad;
218 /* ceph_readdir_prepopulate() will update it */
219 rde->offset = 0;
217 i++; 220 i++;
218 num--; 221 num--;
219 } 222 }
@@ -345,9 +348,9 @@ out_bad:
345 348
346static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 349static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
347{ 350{
348 if (!info->dir_in) 351 if (!info->dir_entries)
349 return; 352 return;
350 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); 353 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
351} 354}
352 355
353 356
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
567 kfree(req); 570 kfree(req);
568} 571}
569 572
573DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
574
570/* 575/*
571 * lookup session, bump ref if found. 576 * lookup session, bump ref if found.
572 * 577 *
573 * called under mdsc->mutex. 578 * called under mdsc->mutex.
574 */ 579 */
575static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, 580static struct ceph_mds_request *
576 u64 tid) 581lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
577{ 582{
578 struct ceph_mds_request *req; 583 struct ceph_mds_request *req;
579 struct rb_node *n = mdsc->request_tree.rb_node;
580
581 while (n) {
582 req = rb_entry(n, struct ceph_mds_request, r_node);
583 if (tid < req->r_tid)
584 n = n->rb_left;
585 else if (tid > req->r_tid)
586 n = n->rb_right;
587 else {
588 ceph_mdsc_get_request(req);
589 return req;
590 }
591 }
592 return NULL;
593}
594 584
595static void __insert_request(struct ceph_mds_client *mdsc, 585 req = lookup_request(&mdsc->request_tree, tid);
596 struct ceph_mds_request *new) 586 if (req)
597{ 587 ceph_mdsc_get_request(req);
598 struct rb_node **p = &mdsc->request_tree.rb_node;
599 struct rb_node *parent = NULL;
600 struct ceph_mds_request *req = NULL;
601 588
602 while (*p) { 589 return req;
603 parent = *p;
604 req = rb_entry(parent, struct ceph_mds_request, r_node);
605 if (new->r_tid < req->r_tid)
606 p = &(*p)->rb_left;
607 else if (new->r_tid > req->r_tid)
608 p = &(*p)->rb_right;
609 else
610 BUG();
611 }
612
613 rb_link_node(&new->r_node, parent, p);
614 rb_insert_color(&new->r_node, &mdsc->request_tree);
615} 590}
616 591
617/* 592/*
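DEFINE_RB_FUNCS comes from include/linux/ceph/libceph.h (touched elsewhere in this series, per the diffstat) and stamps out the per-tree rbtree boilerplate from a key field and a node field. Roughly, a simplified kernel-context sketch of what the request instantiation expands to (assumes <linux/rbtree.h>; not the macro's exact text):

    static void insert_request(struct rb_root *root, struct ceph_mds_request *new)
    {
            struct rb_node **n = &root->rb_node, *parent = NULL;

            while (*n) {
                    struct ceph_mds_request *cur =
                            rb_entry(*n, struct ceph_mds_request, r_node);
                    parent = *n;
                    if (new->r_tid < cur->r_tid)
                            n = &(*n)->rb_left;
                    else if (new->r_tid > cur->r_tid)
                            n = &(*n)->rb_right;
                    else
                            BUG();        /* duplicate tid */
            }
            rb_link_node(&new->r_node, parent, n);
            rb_insert_color(&new->r_node, root);
    }

    static struct ceph_mds_request *lookup_request(struct rb_root *root, u64 tid)
    {
            struct rb_node *n = root->rb_node;

            while (n) {
                    struct ceph_mds_request *cur =
                            rb_entry(n, struct ceph_mds_request, r_node);
                    if (tid < cur->r_tid)
                            n = n->rb_left;
                    else if (tid > cur->r_tid)
                            n = n->rb_right;
                    else
                            return cur;
            }
            return NULL;
    }

    static void erase_request(struct rb_root *root, struct ceph_mds_request *req)
    {
            rb_erase(&req->r_node, root);
            RB_CLEAR_NODE(&req->r_node);
    }

The RB_CLEAR_NODE() added to ceph_mdsc_create_request() in a later hunk initializes r_node to the same cleared state erase_request() leaves behind.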
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
630 req->r_num_caps); 605 req->r_num_caps);
631 dout("__register_request %p tid %lld\n", req, req->r_tid); 606 dout("__register_request %p tid %lld\n", req, req->r_tid);
632 ceph_mdsc_get_request(req); 607 ceph_mdsc_get_request(req);
633 __insert_request(mdsc, req); 608 insert_request(&mdsc->request_tree, req);
634 609
635 req->r_uid = current_fsuid(); 610 req->r_uid = current_fsuid();
636 req->r_gid = current_fsgid(); 611 req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
663 } 638 }
664 } 639 }
665 640
666 rb_erase(&req->r_node, &mdsc->request_tree); 641 erase_request(&mdsc->request_tree, req);
667 RB_CLEAR_NODE(&req->r_node);
668 642
669 if (req->r_unsafe_dir && req->r_got_unsafe) { 643 if (req->r_unsafe_dir && req->r_got_unsafe) {
670 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 644 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
868 int metadata_bytes = 0; 842 int metadata_bytes = 0;
869 int metadata_key_count = 0; 843 int metadata_key_count = 0;
870 struct ceph_options *opt = mdsc->fsc->client->options; 844 struct ceph_options *opt = mdsc->fsc->client->options;
845 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
871 void *p; 846 void *p;
872 847
873 const char* metadata[][2] = { 848 const char* metadata[][2] = {
874 {"hostname", utsname()->nodename}, 849 {"hostname", utsname()->nodename},
875 {"kernel_version", utsname()->release}, 850 {"kernel_version", utsname()->release},
876 {"entity_id", opt->name ? opt->name : ""}, 851 {"entity_id", opt->name ? : ""},
852 {"root", fsopt->server_path ? : "/"},
877 {NULL, NULL} 853 {NULL, NULL}
878 }; 854 };
879 855
@@ -1149,9 +1125,11 @@ out:
1149static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 1125static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1150 void *arg) 1126 void *arg)
1151{ 1127{
1128 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
1152 struct ceph_inode_info *ci = ceph_inode(inode); 1129 struct ceph_inode_info *ci = ceph_inode(inode);
1153 LIST_HEAD(to_remove); 1130 LIST_HEAD(to_remove);
1154 int drop = 0; 1131 bool drop = false;
1132 bool invalidate = false;
1155 1133
1156 dout("removing cap %p, ci is %p, inode is %p\n", 1134 dout("removing cap %p, ci is %p, inode is %p\n",
1157 cap, ci, &ci->vfs_inode); 1135 cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1159 __ceph_remove_cap(cap, false); 1137 __ceph_remove_cap(cap, false);
1160 if (!ci->i_auth_cap) { 1138 if (!ci->i_auth_cap) {
1161 struct ceph_cap_flush *cf; 1139 struct ceph_cap_flush *cf;
1162 struct ceph_mds_client *mdsc = 1140 struct ceph_mds_client *mdsc = fsc->mdsc;
1163 ceph_sb_to_client(inode->i_sb)->mdsc; 1141
1142 ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
1143
1144 if (ci->i_wrbuffer_ref > 0 &&
1145 ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
1146 invalidate = true;
1164 1147
1165 while (true) { 1148 while (true) {
1166 struct rb_node *n = rb_first(&ci->i_cap_flush_tree); 1149 struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1183 inode, ceph_ino(inode)); 1166 inode, ceph_ino(inode));
1184 ci->i_dirty_caps = 0; 1167 ci->i_dirty_caps = 0;
1185 list_del_init(&ci->i_dirty_item); 1168 list_del_init(&ci->i_dirty_item);
1186 drop = 1; 1169 drop = true;
1187 } 1170 }
1188 if (!list_empty(&ci->i_flushing_item)) { 1171 if (!list_empty(&ci->i_flushing_item)) {
1189 pr_warn_ratelimited( 1172 pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1193 ci->i_flushing_caps = 0; 1176 ci->i_flushing_caps = 0;
1194 list_del_init(&ci->i_flushing_item); 1177 list_del_init(&ci->i_flushing_item);
1195 mdsc->num_cap_flushing--; 1178 mdsc->num_cap_flushing--;
1196 drop = 1; 1179 drop = true;
1197 } 1180 }
1198 spin_unlock(&mdsc->cap_dirty_lock); 1181 spin_unlock(&mdsc->cap_dirty_lock);
1199 1182
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1210 list_del(&cf->list); 1193 list_del(&cf->list);
1211 ceph_free_cap_flush(cf); 1194 ceph_free_cap_flush(cf);
1212 } 1195 }
1213 while (drop--) 1196
1197 wake_up_all(&ci->i_cap_wq);
1198 if (invalidate)
1199 ceph_queue_invalidate(inode);
1200 if (drop)
1214 iput(inode); 1201 iput(inode);
1215 return 0; 1202 return 0;
1216} 1203}
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1220 */ 1207 */
1221static void remove_session_caps(struct ceph_mds_session *session) 1208static void remove_session_caps(struct ceph_mds_session *session)
1222{ 1209{
1210 struct ceph_fs_client *fsc = session->s_mdsc->fsc;
1211 struct super_block *sb = fsc->sb;
1223 dout("remove_session_caps on %p\n", session); 1212 dout("remove_session_caps on %p\n", session);
1224 iterate_session_caps(session, remove_session_caps_cb, NULL); 1213 iterate_session_caps(session, remove_session_caps_cb, fsc);
1225 1214
1226 spin_lock(&session->s_cap_lock); 1215 spin_lock(&session->s_cap_lock);
1227 if (session->s_nr_caps > 0) { 1216 if (session->s_nr_caps > 0) {
1228 struct super_block *sb = session->s_mdsc->fsc->sb;
1229 struct inode *inode; 1217 struct inode *inode;
1230 struct ceph_cap *cap, *prev = NULL; 1218 struct ceph_cap *cap, *prev = NULL;
1231 struct ceph_vino vino; 1219 struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1270{ 1258{
1271 struct ceph_inode_info *ci = ceph_inode(inode); 1259 struct ceph_inode_info *ci = ceph_inode(inode);
1272 1260
1273 wake_up_all(&ci->i_cap_wq);
1274 if (arg) { 1261 if (arg) {
1275 spin_lock(&ci->i_ceph_lock); 1262 spin_lock(&ci->i_ceph_lock);
1276 ci->i_wanted_max_size = 0; 1263 ci->i_wanted_max_size = 0;
1277 ci->i_requested_max_size = 0; 1264 ci->i_requested_max_size = 0;
1278 spin_unlock(&ci->i_ceph_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1279 } 1266 }
1267 wake_up_all(&ci->i_cap_wq);
1280 return 0; 1268 return 0;
1281} 1269}
1282 1270
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1671 struct ceph_inode_info *ci = ceph_inode(dir); 1659 struct ceph_inode_info *ci = ceph_inode(dir);
1672 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1660 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1673 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; 1661 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1674 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + 1662 size_t size = sizeof(struct ceph_mds_reply_dir_entry);
1675 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1676 int order, num_entries; 1663 int order, num_entries;
1677 1664
1678 spin_lock(&ci->i_ceph_lock); 1665 spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1683 1670
1684 order = get_order(size * num_entries); 1671 order = get_order(size * num_entries);
1685 while (order >= 0) { 1672 while (order >= 0) {
1686 rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | 1673 rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
1687 __GFP_NOWARN, 1674 __GFP_NOWARN,
1688 order); 1675 order);
1689 if (rinfo->dir_in) 1676 if (rinfo->dir_entries)
1690 break; 1677 break;
1691 order--; 1678 order--;
1692 } 1679 }
1693 if (!rinfo->dir_in) 1680 if (!rinfo->dir_entries)
1694 return -ENOMEM; 1681 return -ENOMEM;
1695 1682
1696 num_entries = (PAGE_SIZE << order) / size; 1683 num_entries = (PAGE_SIZE << order) / size;
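
The hunk above sizes the readdir reply buffer from a single stride, sizeof(struct ceph_mds_reply_dir_entry), instead of summing four parallel-array strides, then retries at smaller page orders until an allocation succeeds. A minimal standalone sketch of that sizing logic, assuming a 64-byte entry struct and modeling the kernel's get_order() for illustration:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* toy stand-in for the kernel's get_order(): smallest order such
 * that (PAGE_SIZE << order) covers len */
static int get_order(unsigned long len)
{
        int order = 0;

        while ((PAGE_SIZE << order) < len)
                order++;
        return order;
}

int main(void)
{
        unsigned long size = 64; /* assumed sizeof(struct ceph_mds_reply_dir_entry) */
        int num_entries = 1024;  /* requested readdir batch size */
        int order = get_order(size * num_entries);

        /* the kernel walks order downward on allocation failure; whatever
         * order is actually obtained caps the usable entry count */
        num_entries = (PAGE_SIZE << order) / size;
        printf("order %d -> %d entries per reply buffer\n", order, num_entries);
        return 0;
}
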
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1722 INIT_LIST_HEAD(&req->r_unsafe_target_item); 1709 INIT_LIST_HEAD(&req->r_unsafe_target_item);
1723 req->r_fmode = -1; 1710 req->r_fmode = -1;
1724 kref_init(&req->r_kref); 1711 kref_init(&req->r_kref);
1712 RB_CLEAR_NODE(&req->r_node);
1725 INIT_LIST_HEAD(&req->r_wait); 1713 INIT_LIST_HEAD(&req->r_wait);
1726 init_completion(&req->r_completion); 1714 init_completion(&req->r_completion);
1727 init_completion(&req->r_safe_completion); 1715 init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2414 /* get request, session */ 2402 /* get request, session */
2415 tid = le64_to_cpu(msg->hdr.tid); 2403 tid = le64_to_cpu(msg->hdr.tid);
2416 mutex_lock(&mdsc->mutex); 2404 mutex_lock(&mdsc->mutex);
2417 req = __lookup_request(mdsc, tid); 2405 req = lookup_get_request(mdsc, tid);
2418 if (!req) { 2406 if (!req) {
2419 dout("handle_reply on unknown tid %llu\n", tid); 2407 dout("handle_reply on unknown tid %llu\n", tid);
2420 mutex_unlock(&mdsc->mutex); 2408 mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2604 fwd_seq = ceph_decode_32(&p); 2592 fwd_seq = ceph_decode_32(&p);
2605 2593
2606 mutex_lock(&mdsc->mutex); 2594 mutex_lock(&mdsc->mutex);
2607 req = __lookup_request(mdsc, tid); 2595 req = lookup_get_request(mdsc, tid);
2608 if (!req) { 2596 if (!req) {
2609 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 2597 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2610 goto out; /* dup reply? */ 2598 goto out; /* dup reply? */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ee69a537dba5..e7d38aac7109 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
47 u32 pool_ns_len; 47 u32 pool_ns_len;
48}; 48};
49 49
50struct ceph_mds_reply_dir_entry {
51 char *name;
52 u32 name_len;
53 struct ceph_mds_reply_lease *lease;
54 struct ceph_mds_reply_info_in inode;
55 loff_t offset;
56};
57
50/* 58/*
51 * parsed info about an mds reply, including information about 59 * parsed info about an mds reply, including information about
52 * either: 1) the target inode and/or its parent directory and dentry, 60 * either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
73 struct ceph_mds_reply_dirfrag *dir_dir; 81 struct ceph_mds_reply_dirfrag *dir_dir;
74 size_t dir_buf_size; 82 size_t dir_buf_size;
75 int dir_nr; 83 int dir_nr;
76 char **dir_dname; 84 bool dir_complete;
77 u32 *dir_dname_len; 85 bool dir_end;
78 struct ceph_mds_reply_lease **dir_dlease; 86 bool hash_order;
79 struct ceph_mds_reply_info_in *dir_in; 87 struct ceph_mds_reply_dir_entry *dir_entries;
80 u8 dir_complete, dir_end;
81 }; 88 };
82 89
83 /* for create results */ 90 /* for create results */
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 261531e55e9d..8c3591a7fbae 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
54 const void *start = *p; 54 const void *start = *p;
55 int i, j, n; 55 int i, j, n;
56 int err = -EINVAL; 56 int err = -EINVAL;
57 u16 version; 57 u8 mdsmap_v, mdsmap_cv;
58 58
59 m = kzalloc(sizeof(*m), GFP_NOFS); 59 m = kzalloc(sizeof(*m), GFP_NOFS);
60 if (m == NULL) 60 if (m == NULL)
61 return ERR_PTR(-ENOMEM); 61 return ERR_PTR(-ENOMEM);
62 62
63 ceph_decode_16_safe(p, end, version, bad); 63 ceph_decode_need(p, end, 1 + 1, bad);
64 if (version > 3) { 64 mdsmap_v = ceph_decode_8(p);
65 pr_warn("got mdsmap version %d > 3, failing", version); 65 mdsmap_cv = ceph_decode_8(p);
66 goto bad; 66 if (mdsmap_v >= 4) {
67 u32 mdsmap_len;
68 ceph_decode_32_safe(p, end, mdsmap_len, bad);
69 if (end < *p + mdsmap_len)
70 goto bad;
71 end = *p + mdsmap_len;
67 } 72 }
68 73
69 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 74 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
87 u32 namelen; 92 u32 namelen;
88 s32 mds, inc, state; 93 s32 mds, inc, state;
89 u64 state_seq; 94 u64 state_seq;
90 u8 infoversion; 95 u8 info_v;
96 void *info_end = NULL;
91 struct ceph_entity_addr addr; 97 struct ceph_entity_addr addr;
92 u32 num_export_targets; 98 u32 num_export_targets;
93 void *pexport_targets = NULL; 99 void *pexport_targets = NULL;
94 struct ceph_timespec laggy_since; 100 struct ceph_timespec laggy_since;
95 struct ceph_mds_info *info; 101 struct ceph_mds_info *info;
96 102
97 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 103 ceph_decode_need(p, end, sizeof(u64) + 1, bad);
98 global_id = ceph_decode_64(p); 104 global_id = ceph_decode_64(p);
 99 infoversion = ceph_decode_8(p); 105 info_v = ceph_decode_8(p);
106 if (info_v >= 4) {
107 u32 info_len;
108 u8 info_cv;
109 ceph_decode_need(p, end, 1 + sizeof(u32), bad);
110 info_cv = ceph_decode_8(p);
111 info_len = ceph_decode_32(p);
112 info_end = *p + info_len;
113 if (info_end > end)
114 goto bad;
115 }
116
117 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
100 *p += sizeof(u64); 118 *p += sizeof(u64);
101 namelen = ceph_decode_32(p); /* skip mds name */ 119 namelen = ceph_decode_32(p); /* skip mds name */
102 *p += namelen; 120 *p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
115 *p += sizeof(u32); 133 *p += sizeof(u32);
116 ceph_decode_32_safe(p, end, namelen, bad); 134 ceph_decode_32_safe(p, end, namelen, bad);
117 *p += namelen; 135 *p += namelen;
118 if (infoversion >= 2) { 136 if (info_v >= 2) {
119 ceph_decode_32_safe(p, end, num_export_targets, bad); 137 ceph_decode_32_safe(p, end, num_export_targets, bad);
120 pexport_targets = *p; 138 pexport_targets = *p;
121 *p += num_export_targets * sizeof(u32); 139 *p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
123 num_export_targets = 0; 141 num_export_targets = 0;
124 } 142 }
125 143
144 if (info_end && *p != info_end) {
145 if (*p > info_end)
146 goto bad;
147 *p = info_end;
148 }
149
126 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 150 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
127 i+1, n, global_id, mds, inc, 151 i+1, n, global_id, mds, inc,
128 ceph_pr_addr(&addr.in_addr), 152 ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
163 m->m_cas_pg_pool = ceph_decode_64(p); 187 m->m_cas_pg_pool = ceph_decode_64(p);
164 188
165 /* ok, we don't care about the rest. */ 189 /* ok, we don't care about the rest. */
190 *p = end;
166 dout("mdsmap_decode success epoch %u\n", m->m_epoch); 191 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
167 return m; 192 return m;
168 193
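
fs/ceph/mdsmap.c now reads a (version, compat version) byte pair up front and, for v4+ encodings, a 32-bit payload length that bounds the rest of the map; each per-MDS info blob gets the same struct_v/struct_cv/struct_len framing, and fields added by newer servers are skipped by clamping *p to the recorded end. A standalone sketch of that skip-unknown-tail pattern, with simplified framing (always length-prefixed) in place of the ceph_decode_* helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* decode one length-framed, versioned blob; 0 on success, -1 if truncated */
static int decode_versioned(const uint8_t **p, const uint8_t *end)
{
        uint8_t v, cv;
        uint32_t len;

        if (end - *p < 2 + 4)
                return -1;
        v = *(*p)++;                    /* struct_v */
        cv = *(*p)++;                   /* struct_cv */
        memcpy(&len, *p, 4);            /* struct_len, little-endian host assumed */
        *p += 4;
        if ((uint32_t)(end - *p) < len)
                return -1;              /* truncated encoding */

        /* ... parse only the fields this decoder understands, gated on v/cv ... */
        (void)v;
        (void)cv;

        *p += len;                      /* skip whatever newer versions appended */
        return 0;
}

int main(void)
{
        /* v=4, cv=1, len=3, then three payload bytes */
        const uint8_t buf[] = { 4, 1, 3, 0, 0, 0, 0xaa, 0xbb, 0xcc };
        const uint8_t *p = buf;

        printf("ret %d, consumed %zu of %zu bytes\n",
               decode_versioned(&p, buf + sizeof(buf)),
               (size_t)(p - buf), sizeof(buf));
        return 0;
}
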
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f12d5e2955c2..91e02481ce06 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
108 * mount options 108 * mount options
109 */ 109 */
110enum { 110enum {
111 Opt_mds_namespace,
111 Opt_wsize, 112 Opt_wsize,
112 Opt_rsize, 113 Opt_rsize,
113 Opt_rasize, 114 Opt_rasize,
@@ -143,6 +144,7 @@ enum {
143}; 144};
144 145
145static match_table_t fsopt_tokens = { 146static match_table_t fsopt_tokens = {
147 {Opt_mds_namespace, "mds_namespace=%d"},
146 {Opt_wsize, "wsize=%d"}, 148 {Opt_wsize, "wsize=%d"},
147 {Opt_rsize, "rsize=%d"}, 149 {Opt_rsize, "rsize=%d"},
148 {Opt_rasize, "rasize=%d"}, 150 {Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
212 break; 214 break;
213 215
214 /* misc */ 216 /* misc */
217 case Opt_mds_namespace:
218 fsopt->mds_namespace = intval;
219 break;
215 case Opt_wsize: 220 case Opt_wsize:
216 fsopt->wsize = intval; 221 fsopt->wsize = intval;
217 break; 222 break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
297{ 302{
298 dout("destroy_mount_options %p\n", args); 303 dout("destroy_mount_options %p\n", args);
299 kfree(args->snapdir_name); 304 kfree(args->snapdir_name);
305 kfree(args->server_path);
300 kfree(args); 306 kfree(args);
301} 307}
302 308
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
328 if (ret) 334 if (ret)
329 return ret; 335 return ret;
330 336
337 ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
338 if (ret)
339 return ret;
340
331 return ceph_compare_options(new_opt, fsc->client); 341 return ceph_compare_options(new_opt, fsc->client);
332} 342}
333 343
334static int parse_mount_options(struct ceph_mount_options **pfsopt, 344static int parse_mount_options(struct ceph_mount_options **pfsopt,
335 struct ceph_options **popt, 345 struct ceph_options **popt,
336 int flags, char *options, 346 int flags, char *options,
337 const char *dev_name, 347 const char *dev_name)
338 const char **path)
339{ 348{
340 struct ceph_mount_options *fsopt; 349 struct ceph_mount_options *fsopt;
341 const char *dev_name_end; 350 const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
367 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 376 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
368 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 377 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
369 fsopt->congestion_kb = default_congestion_kb(); 378 fsopt->congestion_kb = default_congestion_kb();
379 fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
370 380
371 /* 381 /*
372 * Distinguish the server list from the path in "dev_name". 382 * Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
380 */ 390 */
381 dev_name_end = strchr(dev_name, '/'); 391 dev_name_end = strchr(dev_name, '/');
382 if (dev_name_end) { 392 if (dev_name_end) {
383 /* skip over leading '/' for path */ 393 fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
384 *path = dev_name_end + 1; 394 if (!fsopt->server_path) {
395 err = -ENOMEM;
396 goto out;
397 }
385 } else { 398 } else {
386 /* path is empty */
387 dev_name_end = dev_name + strlen(dev_name); 399 dev_name_end = dev_name + strlen(dev_name);
388 *path = dev_name_end;
389 } 400 }
390 err = -EINVAL; 401 err = -EINVAL;
391 dev_name_end--; /* back up to ':' separator */ 402 dev_name_end--; /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
395 goto out; 406 goto out;
396 } 407 }
397 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 408 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
398 dout("server path '%s'\n", *path); 409 if (fsopt->server_path)
410 dout("server path '%s'\n", fsopt->server_path);
399 411
400 *popt = ceph_parse_options(options, dev_name, dev_name_end, 412 *popt = ceph_parse_options(options, dev_name, dev_name_end,
401 parse_fsopt_token, (void *)fsopt); 413 parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
457 seq_puts(m, ",noacl"); 469 seq_puts(m, ",noacl");
458#endif 470#endif
459 471
472 if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
473 seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
460 if (fsopt->wsize) 474 if (fsopt->wsize)
461 seq_printf(m, ",wsize=%d", fsopt->wsize); 475 seq_printf(m, ",wsize=%d", fsopt->wsize);
462 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 476 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
511{ 525{
512 struct ceph_fs_client *fsc; 526 struct ceph_fs_client *fsc;
513 const u64 supported_features = 527 const u64 supported_features =
514 CEPH_FEATURE_FLOCK | 528 CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
515 CEPH_FEATURE_DIRLAYOUTHASH | 529 CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
516 CEPH_FEATURE_MDS_INLINE_DATA;
517 const u64 required_features = 0; 530 const u64 required_features = 0;
518 int page_count; 531 int page_count;
519 size_t size; 532 size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 goto fail; 543 goto fail;
531 } 544 }
532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 545 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
546 fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
533 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); 547 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
534 548
535 fsc->mount_options = fsopt; 549 fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
785/* 799/*
786 * mount: join the ceph cluster, and open root directory. 800 * mount: join the ceph cluster, and open root directory.
787 */ 801 */
788static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, 802static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
789 const char *path)
790{ 803{
791 int err; 804 int err;
792 unsigned long started = jiffies; /* note the start time */ 805 unsigned long started = jiffies; /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
815 goto fail; 828 goto fail;
816 } 829 }
817 830
818 if (path[0] == 0) { 831 if (!fsc->mount_options->server_path) {
819 root = fsc->sb->s_root; 832 root = fsc->sb->s_root;
820 dget(root); 833 dget(root);
821 } else { 834 } else {
822 dout("mount opening base mountpoint\n"); 835 const char *path = fsc->mount_options->server_path + 1;
836 dout("mount opening path %s\n", path);
823 root = open_root_dentry(fsc, path, started); 837 root = open_root_dentry(fsc, path, started);
824 if (IS_ERR(root)) { 838 if (IS_ERR(root)) {
825 err = PTR_ERR(root); 839 err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
935 struct dentry *res; 949 struct dentry *res;
936 int err; 950 int err;
937 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 951 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
938 const char *path = NULL;
939 struct ceph_mount_options *fsopt = NULL; 952 struct ceph_mount_options *fsopt = NULL;
940 struct ceph_options *opt = NULL; 953 struct ceph_options *opt = NULL;
941 954
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
944#ifdef CONFIG_CEPH_FS_POSIX_ACL 957#ifdef CONFIG_CEPH_FS_POSIX_ACL
945 flags |= MS_POSIXACL; 958 flags |= MS_POSIXACL;
946#endif 959#endif
947 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 960 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
948 if (err < 0) { 961 if (err < 0) {
949 res = ERR_PTR(err); 962 res = ERR_PTR(err);
950 goto out_final; 963 goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
987 } 1000 }
988 } 1001 }
989 1002
990 res = ceph_real_mount(fsc, path); 1003 res = ceph_real_mount(fsc);
991 if (IS_ERR(res)) 1004 if (IS_ERR(res))
992 goto out_splat; 1005 goto out_splat;
993 dout("root %p inode %p ino %llx.%llx\n", res, 1006 dout("root %p inode %p ino %llx.%llx\n", res,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b99eb756477..0130a8592191 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_options {
62 int cap_release_safety; 62 int cap_release_safety;
 63 int max_readdir; /* max readdir result (entries) */ 63 int max_readdir; /* max readdir result (entries) */
64 int max_readdir_bytes; /* max readdir result (bytes) */ 64 int max_readdir_bytes; /* max readdir result (bytes) */
65 int mds_namespace;
65 66
66 /* 67 /*
67 * everything above this point can be memcmp'd; everything below 68 * everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
69 */ 70 */
70 71
71 char *snapdir_name; /* default ".snap" */ 72 char *snapdir_name; /* default ".snap" */
73 char *server_path; /* default "/" */
72}; 74};
73 75
74struct ceph_fs_client { 76struct ceph_fs_client {
@@ -295,6 +297,7 @@ struct ceph_inode_info {
295 u64 i_files, i_subdirs; 297 u64 i_files, i_subdirs;
296 298
297 struct rb_root i_fragtree; 299 struct rb_root i_fragtree;
300 int i_fragtree_nsplits;
298 struct mutex i_fragtree_mutex; 301 struct mutex i_fragtree_mutex;
299 302
300 struct ceph_inode_xattrs_info i_xattrs; 303 struct ceph_inode_xattrs_info i_xattrs;
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
469#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 472#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
470#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 473#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
471#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 474#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
475#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
472 476
473static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 477static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
474 long long release_count, 478 long long release_count,
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
537 return (struct ceph_dentry_info *)dentry->d_fsdata; 541 return (struct ceph_dentry_info *)dentry->d_fsdata;
538} 542}
539 543
540static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
541{
542 return ((loff_t)frag << 32) | (loff_t)off;
543}
544
545/* 544/*
546 * caps helpers 545 * caps helpers
547 */ 546 */
@@ -632,7 +631,6 @@ struct ceph_file_info {
632 struct ceph_mds_request *last_readdir; 631 struct ceph_mds_request *last_readdir;
633 632
634 /* readdir: position within a frag */ 633 /* readdir: position within a frag */
635 unsigned offset; /* offset of last chunk, adjusted for . and .. */
636 unsigned next_offset; /* offset of next chunk (last_name's + 1) */ 634 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
637 char *last_name; /* last entry in previous chunk */ 635 char *last_name; /* last entry in previous chunk */
638 long long dir_release_count; 636 long long dir_release_count;
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
927/* file.c */ 925/* file.c */
928extern const struct file_operations ceph_file_fops; 926extern const struct file_operations ceph_file_fops;
929 927
928extern int ceph_renew_caps(struct inode *inode);
930extern int ceph_open(struct inode *inode, struct file *file); 929extern int ceph_open(struct inode *inode, struct file *file);
931extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 930extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
932 struct file *file, unsigned flags, umode_t mode, 931 struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
942extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 941extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
943 ceph_snapdir_dentry_ops; 942 ceph_snapdir_dentry_ops;
944 943
944extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
946extern int ceph_handle_snapdir(struct ceph_mds_request *req, 946extern int ceph_handle_snapdir(struct ceph_mds_request *req,
947 struct dentry *dentry, int err); 947 struct dentry *dentry, int err);
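
ceph_make_fpos() stops being a fixed "(frag << 32) | off" inline helper and becomes a dir.c function that also takes a hash_order flag, since readdir positions can now be ordered by name hash rather than by fragment (see CEPH_READDIR_HASH_ORDER below). The retired encoding, restated as a standalone sketch; the new hash-order bit layout lives in dir.c and is not shown here:

#include <stdio.h>

/* the retired inline helper: fragment in the high 32 bits, offset
 * within that fragment in the low 32 bits */
static long long old_make_fpos(unsigned frag, unsigned off)
{
        return ((long long)frag << 32) | (long long)off;
}

int main(void)
{
        unsigned frag = 0x01000000;     /* a ceph frag_t value */
        unsigned off = 42;

        printf("frag %#x off %u -> fpos %#llx\n",
               frag, off, old_make_fpos(frag, off));
        return 0;
}
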
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..dacc1bd85629 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
77 char buf[128]; 77 char buf[128];
78 78
79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
80 down_read(&osdc->map_sem); 80 down_read(&osdc->lock);
81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
82 if (pool_name) { 82 if (pool_name) {
83 size_t len = strlen(pool_name); 83 size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
109 ret = -ERANGE; 109 ret = -ERANGE;
110 } 110 }
111 } 111 }
112 up_read(&osdc->map_sem); 112 up_read(&osdc->lock);
113 return ret; 113 return ret;
114} 114}
115 115
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
144 const char *pool_name; 144 const char *pool_name;
145 145
146 down_read(&osdc->map_sem); 146 down_read(&osdc->lock);
147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
148 if (pool_name) 148 if (pool_name)
149 ret = snprintf(val, size, "%s", pool_name); 149 ret = snprintf(val, size, "%s", pool_name);
150 else 150 else
151 ret = snprintf(val, size, "%lld", (unsigned long long)pool); 151 ret = snprintf(val, size, "%lld", (unsigned long long)pool);
152 up_read(&osdc->map_sem); 152 up_read(&osdc->lock);
153 return ret; 153 return ret;
154} 154}
155 155
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
862 struct ceph_mds_request *req; 862 struct ceph_mds_request *req;
863 struct ceph_mds_client *mdsc = fsc->mdsc; 863 struct ceph_mds_client *mdsc = fsc->mdsc;
864 struct ceph_pagelist *pagelist = NULL; 864 struct ceph_pagelist *pagelist = NULL;
865 int op = CEPH_MDS_OP_SETXATTR;
865 int err; 866 int err;
866 867
867 if (size > 0) { 868 if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
875 if (err) 876 if (err)
876 goto out; 877 goto out;
877 } else if (!value) { 878 } else if (!value) {
878 flags |= CEPH_XATTR_REMOVE; 879 if (flags & CEPH_XATTR_REPLACE)
880 op = CEPH_MDS_OP_RMXATTR;
881 else
882 flags |= CEPH_XATTR_REMOVE;
879 } 883 }
880 884
881 dout("setxattr value=%.*s\n", (int)size, value); 885 dout("setxattr value=%.*s\n", (int)size, value);
882 886
883 /* do request */ 887 /* do request */
884 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 888 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
885 USE_AUTH_MDS);
886 if (IS_ERR(req)) { 889 if (IS_ERR(req)) {
887 err = PTR_ERR(req); 890 err = PTR_ERR(req);
888 goto out; 891 goto out;
889 } 892 }
890 893
891 req->r_args.setxattr.flags = cpu_to_le32(flags);
892 req->r_path2 = kstrdup(name, GFP_NOFS); 894 req->r_path2 = kstrdup(name, GFP_NOFS);
893 if (!req->r_path2) { 895 if (!req->r_path2) {
894 ceph_mdsc_put_request(req); 896 ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
896 goto out; 898 goto out;
897 } 899 }
898 900
899 req->r_pagelist = pagelist; 901 if (op == CEPH_MDS_OP_SETXATTR) {
900 pagelist = NULL; 902 req->r_args.setxattr.flags = cpu_to_le32(flags);
903 req->r_pagelist = pagelist;
904 pagelist = NULL;
905 }
901 906
902 req->r_inode = inode; 907 req->r_inode = inode;
903 ihold(inode); 908 ihold(inode);
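
ceph_sync_setxattr() previously mapped every NULL-value call to SETXATTR with CEPH_XATTR_REMOVE; with this change a NULL value plus CEPH_XATTR_REPLACE becomes a true RMXATTR operation, which is why the flags and pagelist assignment moves under the SETXATTR-only branch. The dispatch, restated as a standalone sketch:

#include <stdio.h>

#define CEPH_XATTR_REPLACE (1u << 1)
#define CEPH_XATTR_REMOVE  (1u << 31)

enum { OP_SETXATTR, OP_RMXATTR };

/* value == NULL means "delete this xattr" */
static int pick_op(const void *value, unsigned int *flags)
{
        if (!value) {
                if (*flags & CEPH_XATTR_REPLACE)
                        return OP_RMXATTR;      /* real removal op */
                *flags |= CEPH_XATTR_REMOVE;    /* remove via SETXATTR */
        }
        return OP_SETXATTR;
}

int main(void)
{
        unsigned int flags = CEPH_XATTR_REPLACE;

        printf("NULL value + REPLACE -> %s\n",
               pick_op(NULL, &flags) == OP_RMXATTR ? "RMXATTR" : "SETXATTR");
        return 0;
}
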
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index b827e066e55a..146507df8650 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
51 return ceph_frag_make(newbits, 51 return ceph_frag_make(newbits,
52 ceph_frag_value(f) | (i << (24 - newbits))); 52 ceph_frag_value(f) | (i << (24 - newbits)));
53} 53}
54static inline int ceph_frag_is_leftmost(__u32 f) 54static inline bool ceph_frag_is_leftmost(__u32 f)
55{ 55{
56 return ceph_frag_value(f) == 0; 56 return ceph_frag_value(f) == 0;
57} 57}
58static inline int ceph_frag_is_rightmost(__u32 f) 58static inline bool ceph_frag_is_rightmost(__u32 f)
59{ 59{
60 return ceph_frag_value(f) == ceph_frag_mask(f); 60 return ceph_frag_value(f) == ceph_frag_mask(f);
61} 61}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 37f28bf55ce4..dfce616002ad 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
153 153
154/* watch-notify operations */ 154/* watch-notify operations */
155enum { 155enum {
156 WATCH_NOTIFY = 1, /* notifying watcher */ 156 CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
157 WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ 157 CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
158 CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
158}; 159};
159 160
160 161
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
207 struct ceph_fsid fsid; 208 struct ceph_fsid fsid;
208} __attribute__ ((packed)); 209} __attribute__ ((packed));
209 210
211#define CEPH_FS_CLUSTER_ID_NONE -1
212
210/* 213/*
211 * mdsmap flags 214 * mdsmap flags
212 */ 215 */
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
344#define CEPH_XATTR_REPLACE (1 << 1) 347#define CEPH_XATTR_REPLACE (1 << 1)
345#define CEPH_XATTR_REMOVE (1 << 31) 348#define CEPH_XATTR_REMOVE (1 << 31)
346 349
350/*
 351 * readdir request flags.
352 */
353#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
354
355/*
356 * readdir reply flags.
357 */
358#define CEPH_READDIR_FRAG_END (1<<0)
359#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
360#define CEPH_READDIR_HASH_ORDER (1<<9)
361
347union ceph_mds_request_args { 362union ceph_mds_request_args {
348 struct { 363 struct {
349 __le32 mask; /* CEPH_CAP_* */ 364 __le32 mask; /* CEPH_CAP_* */
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
361 __le32 frag; /* which dir fragment */ 376 __le32 frag; /* which dir fragment */
362 __le32 max_entries; /* how many dentries to grab */ 377 __le32 max_entries; /* how many dentries to grab */
363 __le32 max_bytes; 378 __le32 max_bytes;
379 __le16 flags;
364 } __attribute__ ((packed)) readdir; 380 } __attribute__ ((packed)) readdir;
365 struct { 381 struct {
366 __le32 mode; 382 __le32 mode;
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6ef9cc267ec..19e9932f3e77 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
47/* 47/*
48 * bounds check input. 48 * bounds check input.
49 */ 49 */
50static inline int ceph_has_room(void **p, void *end, size_t n) 50static inline bool ceph_has_room(void **p, void *end, size_t n)
51{ 51{
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index db92a8d4926e..690985daad1c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
180 (off >> PAGE_SHIFT); 180 (off >> PAGE_SHIFT);
181} 181}
182 182
183/*
184 * These are not meant to be generic - an integer key is assumed.
185 */
186#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
187static void insert_##name(struct rb_root *root, type *t) \
188{ \
189 struct rb_node **n = &root->rb_node; \
190 struct rb_node *parent = NULL; \
191 \
192 BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
193 \
194 while (*n) { \
195 type *cur = rb_entry(*n, type, nodefld); \
196 \
197 parent = *n; \
198 if (t->keyfld < cur->keyfld) \
199 n = &(*n)->rb_left; \
200 else if (t->keyfld > cur->keyfld) \
201 n = &(*n)->rb_right; \
202 else \
203 BUG(); \
204 } \
205 \
206 rb_link_node(&t->nodefld, parent, n); \
207 rb_insert_color(&t->nodefld, root); \
208} \
209static void erase_##name(struct rb_root *root, type *t) \
210{ \
211 BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
212 rb_erase(&t->nodefld, root); \
213 RB_CLEAR_NODE(&t->nodefld); \
214}
215
216#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
217static type *lookup_##name(struct rb_root *root, \
218 typeof(((type *)0)->keyfld) key) \
219{ \
220 struct rb_node *n = root->rb_node; \
221 \
222 while (n) { \
223 type *cur = rb_entry(n, type, nodefld); \
224 \
225 if (key < cur->keyfld) \
226 n = n->rb_left; \
227 else if (key > cur->keyfld) \
228 n = n->rb_right; \
229 else \
230 return cur; \
231 } \
232 \
233 return NULL; \
234}
235
236#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
237DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
238DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
239
183extern struct kmem_cache *ceph_inode_cachep; 240extern struct kmem_cache *ceph_inode_cachep;
184extern struct kmem_cache *ceph_cap_cachep; 241extern struct kmem_cache *ceph_cap_cachep;
185extern struct kmem_cache *ceph_cap_flush_cachep; 242extern struct kmem_cache *ceph_cap_flush_cachep;
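
DEFINE_RB_FUNCS() stamps out insert_<name>(), erase_<name>() and lookup_<name>() for any struct keyed by a scalar field, replacing the hand-rolled rb-tree walkers in osd_client.c and mon_client.c. A kernel-context sketch (not standalone; struct example_request is hypothetical) of instantiating it:

/* a request keyed by its tid */
struct example_request {
        u64 e_tid;
        struct rb_node e_node;
};

/* expands to insert_request(), erase_request() and lookup_request(),
 * all operating on a struct rb_root ordered by e_tid */
DEFINE_RB_FUNCS(request, struct example_request, e_tid, e_node)

static struct example_request *find_or_warn(struct rb_root *root, u64 tid)
{
        struct example_request *req = lookup_request(root, tid);

        if (!req)
                pr_warn("no request with tid %llu\n", tid);
        return req;
}

Note that insert_##name() BUG()s on a node that is not RB_EMPTY_NODE(), which is why ceph_mdsc_create_request() above now calls RB_CLEAR_NODE() on the freshly allocated r_node.
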
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e230e7ed60d3..e2a92df08b47 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
39 ceph_monc_request_func_t do_request; 39 ceph_monc_request_func_t do_request;
40}; 40};
41 41
42typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
43
42/* 44/*
43 * ceph_mon_generic_request is being used for the statfs and 45 * ceph_mon_generic_request is being used for the statfs and
44 * mon_get_version requests which are being done a bit differently 46 * mon_get_version requests which are being done a bit differently
45 * because we need to get data back to the caller 47 * because we need to get data back to the caller
46 */ 48 */
47struct ceph_mon_generic_request { 49struct ceph_mon_generic_request {
50 struct ceph_mon_client *monc;
48 struct kref kref; 51 struct kref kref;
49 u64 tid; 52 u64 tid;
50 struct rb_node node; 53 struct rb_node node;
51 int result; 54 int result;
52 void *buf; 55
53 struct completion completion; 56 struct completion completion;
57 ceph_monc_callback_t complete_cb;
58 u64 private_data; /* r_tid/linger_id */
59
54 struct ceph_msg *request; /* original request */ 60 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */ 61 struct ceph_msg *reply; /* and reply */
62
63 union {
64 struct ceph_statfs *st;
65 u64 newest;
66 } u;
56}; 67};
57 68
58struct ceph_mon_client { 69struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
77 88
78 /* pending generic requests */ 89 /* pending generic requests */
79 struct rb_root generic_request_tree; 90 struct rb_root generic_request_tree;
80 int num_generic_requests;
81 u64 last_tid; 91 u64 last_tid;
82 92
83 /* subs, indexed with CEPH_SUB_* */ 93 /* subs, indexed with CEPH_SUB_* */
@@ -86,6 +96,7 @@ struct ceph_mon_client {
86 bool want; 96 bool want;
87 u32 have; /* epoch */ 97 u32 have; /* epoch */
88 } subs[3]; 98 } subs[3];
99 int fs_cluster_id; /* "mdsmap.<id>" sub */
89 100
90#ifdef CONFIG_DEBUG_FS 101#ifdef CONFIG_DEBUG_FS
91 struct dentry *debugfs_file; 102 struct dentry *debugfs_file;
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
116bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 127bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
117 bool continuous); 128 bool continuous);
118void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); 129void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
130void ceph_monc_renew_subs(struct ceph_mon_client *monc);
119 131
120extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
121extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 132extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
122 unsigned long timeout); 133 unsigned long timeout);
123 134
124extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, 135extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
125 struct ceph_statfs *buf); 136 struct ceph_statfs *buf);
126 137
127extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, 138int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
128 const char *what, u64 *newest); 139 u64 *newest);
140int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
141 ceph_monc_callback_t cb, u64 private_data);
129 142
130extern int ceph_monc_open_session(struct ceph_mon_client *monc); 143extern int ceph_monc_open_session(struct ceph_mon_client *monc);
131 144
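
ceph_monc_do_get_version() splits into a blocking ceph_monc_get_version() and an async variant whose callback reads req->result and req->u.newest, with private_data ferrying the caller's tid or linger id through the round trip. A kernel-context sketch (not standalone) of the async form:

/* runs when the monitor answers the "osdmap" version query */
static void have_osdmap_version(struct ceph_mon_generic_request *req)
{
        u64 my_tid = req->private_data;   /* whatever the caller stashed */

        if (req->result)
                pr_err("get_version failed: %d\n", req->result);
        else
                pr_info("tid %llu: newest osdmap epoch %llu\n",
                        my_tid, req->u.newest);
}

/* caller side */
static int kick_map_check(struct ceph_mon_client *monc, u64 tid)
{
        return ceph_monc_get_version_async(monc, "osdmap",
                                           have_osdmap_version, tid);
}
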
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cbf460927c42..19b14862d3e0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,10 +20,11 @@ struct ceph_osd_client;
20/* 20/*
21 * completion callback for async writepages 21 * completion callback for async writepages
22 */ 22 */
23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
24 struct ceph_msg *);
25typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); 24typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
26 25
26#define CEPH_HOMELESS_OSD -1
27
27/* a given osd we're communicating with */ 28/* a given osd we're communicating with */
28struct ceph_osd { 29struct ceph_osd {
29 atomic_t o_ref; 30 atomic_t o_ref;
@@ -32,16 +33,15 @@ struct ceph_osd {
32 int o_incarnation; 33 int o_incarnation;
33 struct rb_node o_node; 34 struct rb_node o_node;
34 struct ceph_connection o_con; 35 struct ceph_connection o_con;
35 struct list_head o_requests; 36 struct rb_root o_requests;
36 struct list_head o_linger_requests; 37 struct rb_root o_linger_requests;
37 struct list_head o_osd_lru; 38 struct list_head o_osd_lru;
38 struct ceph_auth_handshake o_auth; 39 struct ceph_auth_handshake o_auth;
39 unsigned long lru_ttl; 40 unsigned long lru_ttl;
40 int o_marked_for_keepalive;
41 struct list_head o_keepalive_item; 41 struct list_head o_keepalive_item;
42 struct mutex lock;
42}; 43};
43 44
44
45#define CEPH_OSD_SLAB_OPS 2 45#define CEPH_OSD_SLAB_OPS 2
46#define CEPH_OSD_MAX_OPS 16 46#define CEPH_OSD_MAX_OPS 16
47 47
@@ -104,76 +104,95 @@ struct ceph_osd_req_op {
104 struct ceph_osd_data response_data; 104 struct ceph_osd_data response_data;
105 __u8 class_len; 105 __u8 class_len;
106 __u8 method_len; 106 __u8 method_len;
107 __u8 argc; 107 u32 indata_len;
108 } cls; 108 } cls;
109 struct { 109 struct {
110 u64 cookie; 110 u64 cookie;
111 u64 ver; 111 __u8 op; /* CEPH_OSD_WATCH_OP_ */
112 u32 prot_ver; 112 u32 gen;
113 u32 timeout;
114 __u8 flag;
115 } watch; 113 } watch;
116 struct { 114 struct {
115 struct ceph_osd_data request_data;
116 } notify_ack;
117 struct {
118 u64 cookie;
119 struct ceph_osd_data request_data;
120 struct ceph_osd_data response_data;
121 } notify;
122 struct {
117 u64 expected_object_size; 123 u64 expected_object_size;
118 u64 expected_write_size; 124 u64 expected_write_size;
119 } alloc_hint; 125 } alloc_hint;
120 }; 126 };
121}; 127};
122 128
129struct ceph_osd_request_target {
130 struct ceph_object_id base_oid;
131 struct ceph_object_locator base_oloc;
132 struct ceph_object_id target_oid;
133 struct ceph_object_locator target_oloc;
134
135 struct ceph_pg pgid;
136 u32 pg_num;
137 u32 pg_num_mask;
138 struct ceph_osds acting;
139 struct ceph_osds up;
140 int size;
141 int min_size;
142 bool sort_bitwise;
143
144 unsigned int flags; /* CEPH_OSD_FLAG_* */
145 bool paused;
146
147 int osd;
148};
149
123/* an in-flight request */ 150/* an in-flight request */
124struct ceph_osd_request { 151struct ceph_osd_request {
125 u64 r_tid; /* unique for this client */ 152 u64 r_tid; /* unique for this client */
126 struct rb_node r_node; 153 struct rb_node r_node;
127 struct list_head r_req_lru_item; 154 struct rb_node r_mc_node; /* map check */
128 struct list_head r_osd_item;
129 struct list_head r_linger_item;
130 struct list_head r_linger_osd_item;
131 struct ceph_osd *r_osd; 155 struct ceph_osd *r_osd;
132 struct ceph_pg r_pgid; 156
133 int r_pg_osds[CEPH_PG_MAX_SIZE]; 157 struct ceph_osd_request_target r_t;
134 int r_num_pg_osds; 158#define r_base_oid r_t.base_oid
159#define r_base_oloc r_t.base_oloc
160#define r_flags r_t.flags
135 161
136 struct ceph_msg *r_request, *r_reply; 162 struct ceph_msg *r_request, *r_reply;
137 int r_flags; /* any additional flags for the osd */
138 u32 r_sent; /* >0 if r_request is sending/sent */ 163 u32 r_sent; /* >0 if r_request is sending/sent */
139 164
140 /* request osd ops array */ 165 /* request osd ops array */
141 unsigned int r_num_ops; 166 unsigned int r_num_ops;
142 167
143 /* these are updated on each send */
144 __le32 *r_request_osdmap_epoch;
145 __le32 *r_request_flags;
146 __le64 *r_request_pool;
147 void *r_request_pgid;
148 __le32 *r_request_attempts;
149 bool r_paused;
150 struct ceph_eversion *r_request_reassert_version;
151
152 int r_result; 168 int r_result;
153 int r_got_reply; 169 bool r_got_reply;
154 int r_linger;
155 170
156 struct ceph_osd_client *r_osdc; 171 struct ceph_osd_client *r_osdc;
157 struct kref r_kref; 172 struct kref r_kref;
158 bool r_mempool; 173 bool r_mempool;
159 struct completion r_completion, r_safe_completion; 174 struct completion r_completion;
175 struct completion r_safe_completion; /* fsync waiter */
160 ceph_osdc_callback_t r_callback; 176 ceph_osdc_callback_t r_callback;
161 ceph_osdc_unsafe_callback_t r_unsafe_callback; 177 ceph_osdc_unsafe_callback_t r_unsafe_callback;
162 struct ceph_eversion r_reassert_version;
163 struct list_head r_unsafe_item; 178 struct list_head r_unsafe_item;
164 179
165 struct inode *r_inode; /* for use by callbacks */ 180 struct inode *r_inode; /* for use by callbacks */
166 void *r_priv; /* ditto */ 181 void *r_priv; /* ditto */
167 182
168 struct ceph_object_locator r_base_oloc; 183 /* set by submitter */
169 struct ceph_object_id r_base_oid; 184 u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
170 struct ceph_object_locator r_target_oloc; 185 struct ceph_snap_context *r_snapc; /* for writes */
171 struct ceph_object_id r_target_oid; 186 struct timespec r_mtime; /* ditto */
172 187 u64 r_data_offset; /* ditto */
173 u64 r_snapid; 188 bool r_linger; /* don't resend on failure */
174 unsigned long r_stamp; /* send OR check time */
175 189
176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 190 /* internal */
191 unsigned long r_stamp; /* jiffies, send or check time */
192 int r_attempts;
193 struct ceph_eversion r_replay_version; /* aka reassert_version */
194 u32 r_last_force_resend;
195 u32 r_map_dne_bound;
177 196
178 struct ceph_osd_req_op r_ops[]; 197 struct ceph_osd_req_op r_ops[];
179}; 198};
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
182 struct ceph_object_locator oloc; 201 struct ceph_object_locator oloc;
183}; 202};
184 203
185struct ceph_osd_event { 204typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
186 u64 cookie; 205 u64 notifier_id, void *data, size_t data_len);
187 int one_shot; 206typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
207
208struct ceph_osd_linger_request {
188 struct ceph_osd_client *osdc; 209 struct ceph_osd_client *osdc;
189 void (*cb)(u64, u64, u8, void *); 210 u64 linger_id;
190 void *data; 211 bool committed;
191 struct rb_node node; 212 bool is_watch; /* watch or notify */
192 struct list_head osd_node; 213
214 struct ceph_osd *osd;
215 struct ceph_osd_request *reg_req;
216 struct ceph_osd_request *ping_req;
217 unsigned long ping_sent;
218 unsigned long watch_valid_thru;
219 struct list_head pending_lworks;
220
221 struct ceph_osd_request_target t;
222 u32 last_force_resend;
223 u32 map_dne_bound;
224
225 struct timespec mtime;
226
193 struct kref kref; 227 struct kref kref;
194}; 228 struct mutex lock;
229 struct rb_node node; /* osd */
230 struct rb_node osdc_node; /* osdc */
231 struct rb_node mc_node; /* map check */
232 struct list_head scan_item;
233
234 struct completion reg_commit_wait;
235 struct completion notify_finish_wait;
236 int reg_commit_error;
237 int notify_finish_error;
238 int last_error;
239
240 u32 register_gen;
241 u64 notify_id;
242
243 rados_watchcb2_t wcb;
244 rados_watcherrcb_t errcb;
245 void *data;
195 246
196struct ceph_osd_event_work { 247 struct page ***preply_pages;
197 struct work_struct work; 248 size_t *preply_len;
198 struct ceph_osd_event *event;
199 u64 ver;
200 u64 notify_id;
201 u8 opcode;
202}; 249};
203 250
204struct ceph_osd_client { 251struct ceph_osd_client {
205 struct ceph_client *client; 252 struct ceph_client *client;
206 253
207 struct ceph_osdmap *osdmap; /* current map */ 254 struct ceph_osdmap *osdmap; /* current map */
208 struct rw_semaphore map_sem; 255 struct rw_semaphore lock;
209 struct completion map_waiters;
210 u64 last_requested_map;
211 256
212 struct mutex request_mutex;
213 struct rb_root osds; /* osds */ 257 struct rb_root osds; /* osds */
214 struct list_head osd_lru; /* idle osds */ 258 struct list_head osd_lru; /* idle osds */
215 u64 timeout_tid; /* tid of timeout triggering rq */ 259 spinlock_t osd_lru_lock;
216 u64 last_tid; /* tid of last request */ 260 struct ceph_osd homeless_osd;
217 struct rb_root requests; /* pending requests */ 261 atomic64_t last_tid; /* tid of last request */
218 struct list_head req_lru; /* in-flight lru */ 262 u64 last_linger_id;
219 struct list_head req_unsent; /* unsent/need-resend queue */ 263 struct rb_root linger_requests; /* lingering requests */
220 struct list_head req_notarget; /* map to no osd */ 264 struct rb_root map_checks;
221 struct list_head req_linger; /* lingering requests */ 265 struct rb_root linger_map_checks;
222 int num_requests; 266 atomic_t num_requests;
267 atomic_t num_homeless;
223 struct delayed_work timeout_work; 268 struct delayed_work timeout_work;
224 struct delayed_work osds_timeout_work; 269 struct delayed_work osds_timeout_work;
225#ifdef CONFIG_DEBUG_FS 270#ifdef CONFIG_DEBUG_FS
@@ -231,10 +276,6 @@ struct ceph_osd_client {
231 struct ceph_msgpool msgpool_op; 276 struct ceph_msgpool msgpool_op;
232 struct ceph_msgpool msgpool_op_reply; 277 struct ceph_msgpool msgpool_op_reply;
233 278
234 spinlock_t event_lock;
235 struct rb_root event_tree;
236 u64 event_count;
237
238 struct workqueue_struct *notify_wq; 279 struct workqueue_struct *notify_wq;
239}; 280};
240 281
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
271extern struct ceph_osd_data *osd_req_op_extent_osd_data( 312extern struct ceph_osd_data *osd_req_op_extent_osd_data(
272 struct ceph_osd_request *osd_req, 313 struct ceph_osd_request *osd_req,
273 unsigned int which); 314 unsigned int which);
274extern struct ceph_osd_data *osd_req_op_cls_response_data(
275 struct ceph_osd_request *osd_req,
276 unsigned int which);
277 315
278extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, 316extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
279 unsigned int which, 317 unsigned int which,
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
309extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 347extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
310 u16 opcode, const char *name, const void *value, 348 u16 opcode, const char *name, const void *value,
311 size_t size, u8 cmp_op, u8 cmp_mode); 349 size_t size, u8 cmp_op, u8 cmp_mode);
312extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
313 unsigned int which, u16 opcode,
314 u64 cookie, u64 version, int flag);
315extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 350extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
316 unsigned int which, 351 unsigned int which,
317 u64 expected_object_size, 352 u64 expected_object_size,
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
322 unsigned int num_ops, 357 unsigned int num_ops,
323 bool use_mempool, 358 bool use_mempool,
324 gfp_t gfp_flags); 359 gfp_t gfp_flags);
325 360int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
326extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
327 struct ceph_snap_context *snapc,
328 u64 snap_id,
329 struct timespec *mtime);
330 361
331extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 362extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
332 struct ceph_file_layout *layout, 363 struct ceph_file_layout *layout,
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
338 u32 truncate_seq, u64 truncate_size, 369 u32 truncate_seq, u64 truncate_size,
339 bool use_mempool); 370 bool use_mempool);
340 371
341extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
342 struct ceph_osd_request *req);
343
344extern void ceph_osdc_get_request(struct ceph_osd_request *req); 372extern void ceph_osdc_get_request(struct ceph_osd_request *req);
345extern void ceph_osdc_put_request(struct ceph_osd_request *req); 373extern void ceph_osdc_put_request(struct ceph_osd_request *req);
346 374
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
353extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 381extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
354 382
355extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); 383extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
384void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
356 385
357extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, 386extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
358 struct ceph_vino vino, 387 struct ceph_vino vino,
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
371 struct timespec *mtime, 400 struct timespec *mtime,
372 struct page **pages, int nr_pages); 401 struct page **pages, int nr_pages);
373 402
374/* watch/notify events */ 403/* watch/notify */
375extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 404struct ceph_osd_linger_request *
376 void (*event_cb)(u64, u64, u8, void *), 405ceph_osdc_watch(struct ceph_osd_client *osdc,
377 void *data, struct ceph_osd_event **pevent); 406 struct ceph_object_id *oid,
378extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 407 struct ceph_object_locator *oloc,
379extern void ceph_osdc_put_event(struct ceph_osd_event *event); 408 rados_watchcb2_t wcb,
409 rados_watcherrcb_t errcb,
410 void *data);
411int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
412 struct ceph_osd_linger_request *lreq);
413
414int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
415 struct ceph_object_id *oid,
416 struct ceph_object_locator *oloc,
417 u64 notify_id,
418 u64 cookie,
419 void *payload,
420 size_t payload_len);
421int ceph_osdc_notify(struct ceph_osd_client *osdc,
422 struct ceph_object_id *oid,
423 struct ceph_object_locator *oloc,
424 void *payload,
425 size_t payload_len,
426 u32 timeout,
427 struct page ***preply_pages,
428 size_t *preply_len);
429int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
430 struct ceph_osd_linger_request *lreq);
380#endif 431#endif
381 432
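
The cookie-based event API gives way to linger requests: ceph_osdc_watch() returns a handle the caller keeps for the lifetime of the watch (rbd stores it as watch_handle), wcb runs for each incoming notify, errcb reports watch breakage, and notifies are answered with ceph_osdc_notify_ack(). A kernel-context sketch (not standalone; struct my_dev and its fields are hypothetical):

static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
                        u64 notifier_id, void *data, size_t data_len)
{
        struct my_dev *dev = arg;

        /* handle the notify payload, then ack so the notifier's
         * ceph_osdc_notify() call can complete */
        ceph_osdc_notify_ack(dev->osdc, &dev->oid, &dev->oloc,
                             notify_id, cookie, NULL, 0);
}

static void my_watch_errcb(void *arg, u64 cookie, int err)
{
        pr_err("watch error %d on cookie %llu\n", err, cookie);
        /* a real client would tear down and re-establish the watch */
}

/* setup: returns a linger handle on success, ERR_PTR() on failure */
dev->handle = ceph_osdc_watch(osdc, &dev->oid, &dev->oloc,
                              my_watch_cb, my_watch_errcb, dev);
/* ... */
ceph_osdc_unwatch(osdc, dev->handle);
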
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08bc3a96..ddc426b22d81 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,21 +24,29 @@ struct ceph_pg {
24 uint32_t seed; 24 uint32_t seed;
25}; 25};
26 26
27#define CEPH_POOL_FLAG_HASHPSPOOL 1 27int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
28
29#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
30 together */
31#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
28 32
29struct ceph_pg_pool_info { 33struct ceph_pg_pool_info {
30 struct rb_node node; 34 struct rb_node node;
31 s64 id; 35 s64 id;
32 u8 type; 36 u8 type; /* CEPH_POOL_TYPE_* */
33 u8 size; 37 u8 size;
38 u8 min_size;
34 u8 crush_ruleset; 39 u8 crush_ruleset;
35 u8 object_hash; 40 u8 object_hash;
41 u32 last_force_request_resend;
36 u32 pg_num, pgp_num; 42 u32 pg_num, pgp_num;
37 int pg_num_mask, pgp_num_mask; 43 int pg_num_mask, pgp_num_mask;
38 s64 read_tier; 44 s64 read_tier;
39 s64 write_tier; /* wins for read+write ops */ 45 s64 write_tier; /* wins for read+write ops */
40 u64 flags; 46 u64 flags; /* CEPH_POOL_FLAG_* */
41 char *name; 47 char *name;
48
49 bool was_full; /* for handle_one_map() */
42}; 50};
43 51
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) 52static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -57,6 +65,22 @@ struct ceph_object_locator {
57 s64 pool; 65 s64 pool;
58}; 66};
59 67
68static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
69{
70 oloc->pool = -1;
71}
72
73static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
74{
75 return oloc->pool == -1;
76}
77
78static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
79 const struct ceph_object_locator *src)
80{
81 dest->pool = src->pool;
82}
83
60/* 84/*
61 * Maximum supported by kernel client object name length 85 * Maximum supported by kernel client object name length
62 * 86 *
@@ -64,11 +88,47 @@ struct ceph_object_locator {
64 */ 88 */
65#define CEPH_MAX_OID_NAME_LEN 100 89#define CEPH_MAX_OID_NAME_LEN 100
66 90
91/*
92 * 51-char inline_name is long enough for all cephfs and all but one
93 * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
94 * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
95 * other rbd requests fit into inline_name.
96 *
97 * Makes ceph_object_id 64 bytes on 64-bit.
98 */
99#define CEPH_OID_INLINE_LEN 52
100
101/*
102 * Both inline and external buffers have space for a NUL-terminator,
103 * which is carried around. It's not required though - RADOS object
104 * names don't have to be NUL-terminated and may contain NULs.
105 */
67struct ceph_object_id { 106struct ceph_object_id {
68 char name[CEPH_MAX_OID_NAME_LEN]; 107 char *name;
108 char inline_name[CEPH_OID_INLINE_LEN];
69 int name_len; 109 int name_len;
70}; 110};
71 111
112static inline void ceph_oid_init(struct ceph_object_id *oid)
113{
114 oid->name = oid->inline_name;
115 oid->name_len = 0;
116}
117
118static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
119{
120 return oid->name == oid->inline_name && !oid->name_len;
121}
122
123void ceph_oid_copy(struct ceph_object_id *dest,
124 const struct ceph_object_id *src);
125__printf(2, 3)
126void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
127__printf(3, 4)
128int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
129 const char *fmt, ...);
130void ceph_oid_destroy(struct ceph_object_id *oid);
131
72struct ceph_pg_mapping { 132struct ceph_pg_mapping {
73 struct rb_node node; 133 struct rb_node node;
74 struct ceph_pg pgid; 134 struct ceph_pg pgid;
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
87struct ceph_osdmap { 147struct ceph_osdmap {
88 struct ceph_fsid fsid; 148 struct ceph_fsid fsid;
89 u32 epoch; 149 u32 epoch;
90 u32 mkfs_epoch;
91 struct ceph_timespec created, modified; 150 struct ceph_timespec created, modified;
92 151
93 u32 flags; /* CEPH_OSDMAP_* */ 152 u32 flags; /* CEPH_OSDMAP_* */
@@ -113,43 +172,19 @@ struct ceph_osdmap {
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; 172 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
114}; 173};
115 174
116static inline void ceph_oid_set_name(struct ceph_object_id *oid, 175static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
117 const char *name)
118{
119 int len;
120
121 len = strlen(name);
122 if (len > sizeof(oid->name)) {
123 WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
124 name, len, sizeof(oid->name));
125 len = sizeof(oid->name);
126 }
127
128 memcpy(oid->name, name, len);
129 oid->name_len = len;
130}
131
132static inline void ceph_oid_copy(struct ceph_object_id *dest,
133 struct ceph_object_id *src)
134{
135 BUG_ON(src->name_len > sizeof(dest->name));
136 memcpy(dest->name, src->name, src->name_len);
137 dest->name_len = src->name_len;
138}
139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{ 176{
142 return osd >= 0 && osd < map->max_osd && 177 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS); 178 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144} 179}
145 180
146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 181static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
147{ 182{
148 return ceph_osd_exists(map, osd) && 183 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP); 184 (map->osd_state[osd] & CEPH_OSD_UP);
150} 185}
151 186
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) 187static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{ 188{
154 return !ceph_osd_is_up(map, osd); 189 return !ceph_osd_is_up(map, osd);
155} 190}
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
192 return 0; 227 return 0;
193} 228}
194 229
230struct ceph_osdmap *ceph_osdmap_alloc(void);
195extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); 231extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
196extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 232struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
197 struct ceph_osdmap *map, 233 struct ceph_osdmap *map);
198 struct ceph_messenger *msgr);
199extern void ceph_osdmap_destroy(struct ceph_osdmap *map); 234extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
200 235
236struct ceph_osds {
237 int osds[CEPH_PG_MAX_SIZE];
238 int size;
239 int primary; /* id, NOT index */
240};
241
242static inline void ceph_osds_init(struct ceph_osds *set)
243{
244 set->size = 0;
245 set->primary = -1;
246}
247
248void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
249
250bool ceph_is_new_interval(const struct ceph_osds *old_acting,
251 const struct ceph_osds *new_acting,
252 const struct ceph_osds *old_up,
253 const struct ceph_osds *new_up,
254 int old_size,
255 int new_size,
256 int old_min_size,
257 int new_min_size,
258 u32 old_pg_num,
259 u32 new_pg_num,
260 bool old_sort_bitwise,
261 bool new_sort_bitwise,
262 const struct ceph_pg *pgid);
263bool ceph_osds_changed(const struct ceph_osds *old_acting,
264 const struct ceph_osds *new_acting,
265 bool any_change);
266
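Both helpers feed interval detection: when an incoming osdmap changes a PG's up or acting set (or its pg_num, min_size, sort_bitwise, etc.), in-flight requests targeting that PG have to be re-examined. Conceptually, a sketch rather than the exact call site:

    struct ceph_osds old_up, old_acting, up, acting;

    ceph_pg_to_up_acting_osds(old_map, &pgid, &old_up, &old_acting);
    ceph_pg_to_up_acting_osds(new_map, &pgid, &up, &acting);

    if (ceph_osds_changed(&old_acting, &acting, any_change))
            force_resend = true;    /* hypothetical flag: re-target and re-send */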
201/* calculate mapping of a file extent to an object */ 267/* calculate mapping of a file extent to an object */
202extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 268extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
203 u64 off, u64 len, 269 u64 off, u64 len,
204 u64 *bno, u64 *oxoff, u64 *oxlen); 270 u64 *bno, u64 *oxoff, u64 *oxlen);
205 271
206/* calculate mapping of object to a placement group */ 272int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
207extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 273 struct ceph_object_id *oid,
208 struct ceph_object_locator *oloc, 274 struct ceph_object_locator *oloc,
209 struct ceph_object_id *oid, 275 struct ceph_pg *raw_pgid);
210 struct ceph_pg *pg_out); 276
211 277void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
212extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 278 const struct ceph_pg *raw_pgid,
213 struct ceph_pg pgid, 279 struct ceph_osds *up,
214 int *osds, int *primary); 280 struct ceph_osds *acting);
215extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 281int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
216 struct ceph_pg pgid); 282 const struct ceph_pg *raw_pgid);
217 283
218extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 284extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
219 u64 id); 285 u64 id);
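Together with the renames, mapping becomes an explicit two-step pipeline instead of the old ceph_oloc_oid_to_pg()/ceph_calc_pg_acting() pair. A sketch of a caller, error handling abbreviated:

    struct ceph_pg raw_pgid;
    struct ceph_osds up, acting;
    int ret;

    ret = ceph_object_locator_to_pg(osdmap, &oid, &oloc, &raw_pgid);
    if (ret)
            return ret;             /* e.g. pool no longer exists */

    ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
    if (acting.primary < 0)
            return -EAGAIN;         /* illustrative: no usable primary */

ceph_pg_to_acting_primary() is the shorthand when only the primary's id is wanted (an osd id, not an index into acting.osds[]).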
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2f822dca1046..5c0da61cb763 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
114 * compound epoch+version, used by storage layer to serialize mutations 114 * compound epoch+version, used by storage layer to serialize mutations
115 */ 115 */
116struct ceph_eversion { 116struct ceph_eversion {
117 __le32 epoch;
118 __le64 version; 117 __le64 version;
118 __le32 epoch;
119} __attribute__ ((packed)); 119} __attribute__ ((packed));
120 120
121/* 121/*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ 153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ 154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ 155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
156#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
157#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
158#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
159#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
160#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
156 161
157/* 162/*
158 * The error code to return when an OSD can't handle a write 163 * The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
389 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ 394 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
390 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ 395 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
391 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ 396 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
397 CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
398 CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
399 pool uses pool snaps */
400 CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
401 CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
402 CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
403 CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
392}; 404};
393 405
394enum { 406enum {
@@ -415,7 +427,17 @@ enum {
415 CEPH_OSD_CMPXATTR_MODE_U64 = 2 427 CEPH_OSD_CMPXATTR_MODE_U64 = 2
416}; 428};
417 429
418#define RADOS_NOTIFY_VER 1 430enum {
431 CEPH_OSD_WATCH_OP_UNWATCH = 0,
432 CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
433 /* note: use only ODD ids to prevent pre-giant code from
434 interpreting the op as UNWATCH */
435 CEPH_OSD_WATCH_OP_WATCH = 3,
436 CEPH_OSD_WATCH_OP_RECONNECT = 5,
437 CEPH_OSD_WATCH_OP_PING = 7,
438};
439
440const char *ceph_osd_watch_op_name(int o);
419 441
420/* 442/*
421 * an individual object operation. each may be accompanied by some data 443 * an individual object operation. each may be accompanied by some data
@@ -450,10 +472,14 @@ struct ceph_osd_op {
450 } __attribute__ ((packed)) snap; 472 } __attribute__ ((packed)) snap;
451 struct { 473 struct {
452 __le64 cookie; 474 __le64 cookie;
453 __le64 ver; 475 __le64 ver; /* no longer used */
454 __u8 flag; /* 0 = unwatch, 1 = watch */ 476 __u8 op; /* CEPH_OSD_WATCH_OP_* */
477 __le32 gen; /* registration generation */
455 } __attribute__ ((packed)) watch; 478 } __attribute__ ((packed)) watch;
456 struct { 479 struct {
480 __le64 cookie;
481 } __attribute__ ((packed)) notify;
482 struct {
457 __le64 offset, length; 483 __le64 offset, length;
458 __le64 src_offset; 484 __le64 src_offset;
459 } __attribute__ ((packed)) clonerange; 485 } __attribute__ ((packed)) clonerange;
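On the wire, the watch union member trades its single flag byte for an op code plus a registration generation; the encode side (osd_req_encode_op() later in this patch) becomes:

    dst->watch.cookie = cpu_to_le64(src->watch.cookie);
    dst->watch.ver = cpu_to_le64(0);        /* no longer used */
    dst->watch.op = src->watch.op;          /* CEPH_OSD_WATCH_OP_* */
    dst->watch.gen = cpu_to_le32(src->watch.gen);

gen is bumped each time a linger request is re-registered, letting the OSD tell a fresh CEPH_OSD_WATCH_OP_WATCH apart from a RECONNECT of the same cookie.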
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
651/* 651/*
652 * true if we have the mon map (and have thus joined the cluster) 652 * true if we have the mon map (and have thus joined the cluster)
653 */ 653 */
654static int have_mon_and_osd_map(struct ceph_client *client) 654static bool have_mon_and_osd_map(struct ceph_client *client)
655{ 655{
656 return client->monc.monmap && client->monc.monmap->epoch && 656 return client->monc.monmap && client->monc.monmap->epoch &&
657 client->osdc.osdmap && client->osdc.osdmap->epoch; 657 client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
27 } 27 }
28} 28}
29 29
30const char *ceph_osd_watch_op_name(int o)
31{
32 switch (o) {
33 case CEPH_OSD_WATCH_OP_UNWATCH:
34 return "unwatch";
35 case CEPH_OSD_WATCH_OP_WATCH:
36 return "watch";
37 case CEPH_OSD_WATCH_OP_RECONNECT:
38 return "reconnect";
39 case CEPH_OSD_WATCH_OP_PING:
40 return "ping";
41 default:
42 return "???";
43 }
44}
45
30const char *ceph_osd_state_name(int s) 46const char *ceph_osd_state_name(int s)
31{ 47{
32 switch (s) { 48 switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
54{ 54{
55 int i; 55 int i;
56 struct ceph_client *client = s->private; 56 struct ceph_client *client = s->private;
57 struct ceph_osdmap *map = client->osdc.osdmap; 57 struct ceph_osd_client *osdc = &client->osdc;
58 struct ceph_osdmap *map = osdc->osdmap;
58 struct rb_node *n; 59 struct rb_node *n;
59 60
60 if (map == NULL) 61 if (map == NULL)
61 return 0; 62 return 0;
62 63
63 seq_printf(s, "epoch %d\n", map->epoch); 64 down_read(&osdc->lock);
64 seq_printf(s, "flags%s%s\n", 65 seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
65 (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
66 (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
67 66
68 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { 67 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
69 struct ceph_pg_pool_info *pool = 68 struct ceph_pg_pool_info *pi =
70 rb_entry(n, struct ceph_pg_pool_info, node); 69 rb_entry(n, struct ceph_pg_pool_info, node);
71 70
72 seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", 71 seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
73 pool->id, pool->pg_num, pool->pg_num_mask, 72 pi->id, pi->name, pi->type, pi->size, pi->min_size,
74 pool->read_tier, pool->write_tier); 73 pi->pg_num, pi->pg_num_mask, pi->flags,
74 pi->last_force_request_resend, pi->read_tier,
75 pi->write_tier);
75 } 76 }
76 for (i = 0; i < map->max_osd; i++) { 77 for (i = 0; i < map->max_osd; i++) {
77 struct ceph_entity_addr *addr = &map->osd_addr[i]; 78 struct ceph_entity_addr *addr = &map->osd_addr[i];
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
103 pg->pgid.seed, pg->primary_temp.osd); 104 pg->pgid.seed, pg->primary_temp.osd);
104 } 105 }
105 106
107 up_read(&osdc->lock);
106 return 0; 108 return 0;
107} 109}
108 110
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
126 CEPH_SUBSCRIBE_ONETIME ? "" : "+")); 128 CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
127 seq_putc(s, '\n'); 129 seq_putc(s, '\n');
128 } 130 }
131 seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
129 132
130 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { 133 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
131 __u16 op; 134 __u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
143 return 0; 146 return 0;
144} 147}
145 148
146static int osdc_show(struct seq_file *s, void *pp) 149static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
147{ 150{
148 struct ceph_client *client = s->private; 151 int i;
149 struct ceph_osd_client *osdc = &client->osdc;
150 struct rb_node *p;
151 152
152 mutex_lock(&osdc->request_mutex); 153 seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
153 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 154 for (i = 0; i < t->up.size; i++)
154 struct ceph_osd_request *req; 155 seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
155 unsigned int i; 156 seq_printf(s, "]/%d\t[", t->up.primary);
156 int opcode; 157 for (i = 0; i < t->acting.size; i++)
158 seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
159 seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
160 t->target_oid.name_len, t->target_oid.name, t->flags);
161 if (t->paused)
162 seq_puts(s, "\tP");
163}
157 164
158 req = rb_entry(p, struct ceph_osd_request, r_node); 165static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
166{
167 int i;
159 168
160 seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, 169 seq_printf(s, "%llu\t", req->r_tid);
161 req->r_osd ? req->r_osd->o_osd : -1, 170 dump_target(s, &req->r_t);
162 req->r_pgid.pool, req->r_pgid.seed);
163 171
164 seq_printf(s, "%.*s", req->r_base_oid.name_len, 172 seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
165 req->r_base_oid.name); 173 le32_to_cpu(req->r_replay_version.epoch),
174 le64_to_cpu(req->r_replay_version.version));
166 175
167 if (req->r_reassert_version.epoch) 176 for (i = 0; i < req->r_num_ops; i++) {
168 seq_printf(s, "\t%u'%llu", 177 struct ceph_osd_req_op *op = &req->r_ops[i];
169 (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), 178
170 le64_to_cpu(req->r_reassert_version.version)); 179 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
171 else 180 ceph_osd_op_name(op->op));
172 seq_printf(s, "\t"); 181 if (op->op == CEPH_OSD_OP_WATCH)
182 seq_printf(s, "-%s",
183 ceph_osd_watch_op_name(op->watch.op));
184 }
185
186 seq_putc(s, '\n');
187}
188
189static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
190{
191 struct rb_node *n;
192
193 mutex_lock(&osd->lock);
194 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
195 struct ceph_osd_request *req =
196 rb_entry(n, struct ceph_osd_request, r_node);
197
198 dump_request(s, req);
199 }
200
201 mutex_unlock(&osd->lock);
202}
173 203
174 for (i = 0; i < req->r_num_ops; i++) { 204static void dump_linger_request(struct seq_file *s,
175 opcode = req->r_ops[i].op; 205 struct ceph_osd_linger_request *lreq)
176 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), 206{
177 ceph_osd_op_name(opcode)); 207 seq_printf(s, "%llu\t", lreq->linger_id);
178 } 208 dump_target(s, &lreq->t);
209
210 seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
211 lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
212 lreq->last_error);
213}
214
215static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
216{
217 struct rb_node *n;
218
219 mutex_lock(&osd->lock);
220 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
221 struct ceph_osd_linger_request *lreq =
222 rb_entry(n, struct ceph_osd_linger_request, node);
223
224 dump_linger_request(s, lreq);
225 }
226
227 mutex_unlock(&osd->lock);
228}
179 229
180 seq_printf(s, "\n"); 230static int osdc_show(struct seq_file *s, void *pp)
231{
232 struct ceph_client *client = s->private;
233 struct ceph_osd_client *osdc = &client->osdc;
234 struct rb_node *n;
235
236 down_read(&osdc->lock);
237 seq_printf(s, "REQUESTS %d homeless %d\n",
238 atomic_read(&osdc->num_requests),
239 atomic_read(&osdc->num_homeless));
240 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
241 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
242
243 dump_requests(s, osd);
181 } 244 }
182 mutex_unlock(&osdc->request_mutex); 245 dump_requests(s, &osdc->homeless_osd);
246
247 seq_puts(s, "LINGER REQUESTS\n");
248 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
249 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
250
251 dump_linger_requests(s, osd);
252 }
253 dump_linger_requests(s, &osdc->homeless_osd);
254
255 up_read(&osdc->lock);
183 return 0; 256 return 0;
184} 257}
185 258
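With dump_target()/dump_request() in place, each osdc line carries the tid, target osd, pgid, up and acting sets (each with its primary), target oid, flags, attempt count, replay version and the op vector. An illustrative excerpt, all values invented:

    REQUESTS 1 homeless 0
    1234    osd1    2.5e    [1,4,7]/1    [1,4,7]/1    rbd_data.1003    0x10    0    0'0    read
    LINGER REQUESTS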
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
260 BUG_ON(num < 1); /* monmap sub is always there */ 260 BUG_ON(num < 1); /* monmap sub is always there */
261 ceph_encode_32(&p, num); 261 ceph_encode_32(&p, num);
262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
263 const char *s = ceph_sub_str[i]; 263 char buf[32];
264 int len;
264 265
265 if (!monc->subs[i].want) 266 if (!monc->subs[i].want)
266 continue; 267 continue;
267 268
268 dout("%s %s start %llu flags 0x%x\n", __func__, s, 269 len = sprintf(buf, "%s", ceph_sub_str[i]);
270 if (i == CEPH_SUB_MDSMAP &&
271 monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
272 len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
273
274 dout("%s %s start %llu flags 0x%x\n", __func__, buf,
269 le64_to_cpu(monc->subs[i].item.start), 275 le64_to_cpu(monc->subs[i].item.start),
270 monc->subs[i].item.flags); 276 monc->subs[i].item.flags);
271 ceph_encode_string(&p, end, s, strlen(s)); 277 ceph_encode_string(&p, end, buf, len);
272 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); 278 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
273 p += sizeof(monc->subs[i].item); 279 p += sizeof(monc->subs[i].item);
274 } 280 }
275 281
276 BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); 282 BUG_ON(p > end);
277 msg->front.iov_len = p - msg->front.iov_base; 283 msg->front.iov_len = p - msg->front.iov_base;
278 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 284 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
279 ceph_msg_revoke(msg); 285 ceph_msg_revoke(msg);
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
376} 382}
377EXPORT_SYMBOL(ceph_monc_got_map); 383EXPORT_SYMBOL(ceph_monc_got_map);
378 384
379/* 385void ceph_monc_renew_subs(struct ceph_mon_client *monc)
380 * Register interest in the next osdmap
381 */
382void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
383{ 386{
384 dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
385 mutex_lock(&monc->mutex); 387 mutex_lock(&monc->mutex);
386 if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 388 __send_subscribe(monc);
387 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
388 __send_subscribe(monc);
389 mutex_unlock(&monc->mutex); 389 mutex_unlock(&monc->mutex);
390} 390}
391EXPORT_SYMBOL(ceph_monc_request_next_osdmap); 391EXPORT_SYMBOL(ceph_monc_renew_subs);
392 392
393/* 393/*
394 * Wait for an osdmap with a given epoch. 394 * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
478/* 478/*
479 * generic requests (currently statfs, mon_get_version) 479 * generic requests (currently statfs, mon_get_version)
480 */ 480 */
481static struct ceph_mon_generic_request *__lookup_generic_req( 481DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
482 struct ceph_mon_client *monc, u64 tid)
483{
484 struct ceph_mon_generic_request *req;
485 struct rb_node *n = monc->generic_request_tree.rb_node;
486
487 while (n) {
488 req = rb_entry(n, struct ceph_mon_generic_request, node);
489 if (tid < req->tid)
490 n = n->rb_left;
491 else if (tid > req->tid)
492 n = n->rb_right;
493 else
494 return req;
495 }
496 return NULL;
497}
498
499static void __insert_generic_request(struct ceph_mon_client *monc,
500 struct ceph_mon_generic_request *new)
501{
502 struct rb_node **p = &monc->generic_request_tree.rb_node;
503 struct rb_node *parent = NULL;
504 struct ceph_mon_generic_request *req = NULL;
505
506 while (*p) {
507 parent = *p;
508 req = rb_entry(parent, struct ceph_mon_generic_request, node);
509 if (new->tid < req->tid)
510 p = &(*p)->rb_left;
511 else if (new->tid > req->tid)
512 p = &(*p)->rb_right;
513 else
514 BUG();
515 }
516
517 rb_link_node(&new->node, parent, p);
518 rb_insert_color(&new->node, &monc->generic_request_tree);
519}
520 482
521static void release_generic_request(struct kref *kref) 483static void release_generic_request(struct kref *kref)
522{ 484{
523 struct ceph_mon_generic_request *req = 485 struct ceph_mon_generic_request *req =
524 container_of(kref, struct ceph_mon_generic_request, kref); 486 container_of(kref, struct ceph_mon_generic_request, kref);
525 487
488 dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
489 req->reply);
490 WARN_ON(!RB_EMPTY_NODE(&req->node));
491
526 if (req->reply) 492 if (req->reply)
527 ceph_msg_put(req->reply); 493 ceph_msg_put(req->reply);
528 if (req->request) 494 if (req->request)
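The open-coded insert/lookup pair deleted above is exactly the pattern that DEFINE_RB_FUNCS() (added to include/linux/ceph/libceph.h in this series) stamps out from a name, a type, a key field and an rb_node field. Roughly, the one-liner expands to something like this simplified sketch:

    static void insert_generic_request(struct rb_root *root,
                                       struct ceph_mon_generic_request *t)
    {
            struct rb_node **n = &root->rb_node, *parent = NULL;

            while (*n) {
                    struct ceph_mon_generic_request *cur =
                        rb_entry(*n, struct ceph_mon_generic_request, node);

                    parent = *n;
                    if (t->tid < cur->tid)
                            n = &(*n)->rb_left;
                    else if (t->tid > cur->tid)
                            n = &(*n)->rb_right;
                    else
                            BUG();
            }
            rb_link_node(&t->node, parent, n);
            rb_insert_color(&t->node, root);
    }

lookup_generic_request() walks the same comparisons and returns the node or NULL; erase_generic_request() does rb_erase() plus RB_CLEAR_NODE(), which is what makes the WARN_ON(!RB_EMPTY_NODE(&req->node)) in release_generic_request() meaningful.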
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
533 499
534static void put_generic_request(struct ceph_mon_generic_request *req) 500static void put_generic_request(struct ceph_mon_generic_request *req)
535{ 501{
536 kref_put(&req->kref, release_generic_request); 502 if (req)
503 kref_put(&req->kref, release_generic_request);
537} 504}
538 505
539static void get_generic_request(struct ceph_mon_generic_request *req) 506static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
541 kref_get(&req->kref); 508 kref_get(&req->kref);
542} 509}
543 510
511static struct ceph_mon_generic_request *
512alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
513{
514 struct ceph_mon_generic_request *req;
515
516 req = kzalloc(sizeof(*req), gfp);
517 if (!req)
518 return NULL;
519
520 req->monc = monc;
521 kref_init(&req->kref);
522 RB_CLEAR_NODE(&req->node);
523 init_completion(&req->completion);
524
525 dout("%s greq %p\n", __func__, req);
526 return req;
527}
528
529static void register_generic_request(struct ceph_mon_generic_request *req)
530{
531 struct ceph_mon_client *monc = req->monc;
532
533 WARN_ON(req->tid);
534
535 get_generic_request(req);
536 req->tid = ++monc->last_tid;
537 insert_generic_request(&monc->generic_request_tree, req);
538}
539
540static void send_generic_request(struct ceph_mon_client *monc,
541 struct ceph_mon_generic_request *req)
542{
543 WARN_ON(!req->tid);
544
545 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
546 req->request->hdr.tid = cpu_to_le64(req->tid);
547 ceph_con_send(&monc->con, ceph_msg_get(req->request));
548}
549
550static void __finish_generic_request(struct ceph_mon_generic_request *req)
551{
552 struct ceph_mon_client *monc = req->monc;
553
554 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
555 erase_generic_request(&monc->generic_request_tree, req);
556
557 ceph_msg_revoke(req->request);
558 ceph_msg_revoke_incoming(req->reply);
559}
560
561static void finish_generic_request(struct ceph_mon_generic_request *req)
562{
563 __finish_generic_request(req);
564 put_generic_request(req);
565}
566
567static void complete_generic_request(struct ceph_mon_generic_request *req)
568{
569 if (req->complete_cb)
570 req->complete_cb(req);
571 else
572 complete_all(&req->completion);
573 put_generic_request(req);
574}
575
576static void cancel_generic_request(struct ceph_mon_generic_request *req)
577{
578 struct ceph_mon_client *monc = req->monc;
579 struct ceph_mon_generic_request *lookup_req;
580
581 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
582
583 mutex_lock(&monc->mutex);
584 lookup_req = lookup_generic_request(&monc->generic_request_tree,
585 req->tid);
586 if (lookup_req) {
587 WARN_ON(lookup_req != req);
588 finish_generic_request(req);
589 }
590
591 mutex_unlock(&monc->mutex);
592}
593
594static int wait_generic_request(struct ceph_mon_generic_request *req)
595{
596 int ret;
597
598 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
599 ret = wait_for_completion_interruptible(&req->completion);
600 if (ret)
601 cancel_generic_request(req);
602 else
603 ret = req->result; /* completed */
604
605 return ret;
606}
607
544static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 608static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
545 struct ceph_msg_header *hdr, 609 struct ceph_msg_header *hdr,
546 int *skip) 610 int *skip)
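The new helpers compose into one fixed lifecycle, which the reworked ceph_monc_do_statfs() below follows. In outline:

    req = alloc_generic_request(monc, GFP_NOFS);
    /* ... allocate req->request and req->reply messages ... */

    mutex_lock(&monc->mutex);
    register_generic_request(req);          /* assigns tid, inserts into tree */
    /* ... encode any tid-dependent payload into req->request ... */
    send_generic_request(monc, req);
    mutex_unlock(&monc->mutex);

    ret = wait_generic_request(req);        /* cancels if interrupted */
    put_generic_request(req);

Note that register_generic_request() takes an extra reference, dropped by complete_generic_request() or finish_generic_request(), so the request stays valid for as long as it sits in the tree.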
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
551 struct ceph_msg *m; 615 struct ceph_msg *m;
552 616
553 mutex_lock(&monc->mutex); 617 mutex_lock(&monc->mutex);
554 req = __lookup_generic_req(monc, tid); 618 req = lookup_generic_request(&monc->generic_request_tree, tid);
555 if (!req) { 619 if (!req) {
556 dout("get_generic_reply %lld dne\n", tid); 620 dout("get_generic_reply %lld dne\n", tid);
557 *skip = 1; 621 *skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
570 return m; 634 return m;
571} 635}
572 636
573static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
574 struct ceph_mon_generic_request *req)
575{
576 int err;
577
578 /* register request */
579 req->tid = tid != 0 ? tid : ++monc->last_tid;
580 req->request->hdr.tid = cpu_to_le64(req->tid);
581 __insert_generic_request(monc, req);
582 monc->num_generic_requests++;
583 ceph_con_send(&monc->con, ceph_msg_get(req->request));
584 mutex_unlock(&monc->mutex);
585
586 err = wait_for_completion_interruptible(&req->completion);
587
588 mutex_lock(&monc->mutex);
589 rb_erase(&req->node, &monc->generic_request_tree);
590 monc->num_generic_requests--;
591
592 if (!err)
593 err = req->result;
594 return err;
595}
596
597static int do_generic_request(struct ceph_mon_client *monc,
598 struct ceph_mon_generic_request *req)
599{
600 int err;
601
602 mutex_lock(&monc->mutex);
603 err = __do_generic_request(monc, 0, req);
604 mutex_unlock(&monc->mutex);
605
606 return err;
607}
608
609/* 637/*
610 * statfs 638 * statfs
611 */ 639 */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
616 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 644 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
617 u64 tid = le64_to_cpu(msg->hdr.tid); 645 u64 tid = le64_to_cpu(msg->hdr.tid);
618 646
647 dout("%s msg %p tid %llu\n", __func__, msg, tid);
648
619 if (msg->front.iov_len != sizeof(*reply)) 649 if (msg->front.iov_len != sizeof(*reply))
620 goto bad; 650 goto bad;
621 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
622 651
623 mutex_lock(&monc->mutex); 652 mutex_lock(&monc->mutex);
624 req = __lookup_generic_req(monc, tid); 653 req = lookup_generic_request(&monc->generic_request_tree, tid);
625 if (req) { 654 if (!req) {
626 *(struct ceph_statfs *)req->buf = reply->st; 655 mutex_unlock(&monc->mutex);
627 req->result = 0; 656 return;
628 get_generic_request(req);
629 } 657 }
658
659 req->result = 0;
660 *req->u.st = reply->st; /* struct */
661 __finish_generic_request(req);
630 mutex_unlock(&monc->mutex); 662 mutex_unlock(&monc->mutex);
631 if (req) { 663
632 complete_all(&req->completion); 664 complete_generic_request(req);
633 put_generic_request(req);
634 }
635 return; 665 return;
636 666
637bad: 667bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
646{ 676{
647 struct ceph_mon_generic_request *req; 677 struct ceph_mon_generic_request *req;
648 struct ceph_mon_statfs *h; 678 struct ceph_mon_statfs *h;
649 int err; 679 int ret = -ENOMEM;
650 680
651 req = kzalloc(sizeof(*req), GFP_NOFS); 681 req = alloc_generic_request(monc, GFP_NOFS);
652 if (!req) 682 if (!req)
653 return -ENOMEM; 683 goto out;
654
655 kref_init(&req->kref);
656 req->buf = buf;
657 init_completion(&req->completion);
658 684
659 err = -ENOMEM;
660 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 685 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
661 true); 686 true);
662 if (!req->request) 687 if (!req->request)
663 goto out; 688 goto out;
664 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 689
665 true); 690 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
666 if (!req->reply) 691 if (!req->reply)
667 goto out; 692 goto out;
668 693
694 req->u.st = buf;
695
696 mutex_lock(&monc->mutex);
697 register_generic_request(req);
669 /* fill out request */ 698 /* fill out request */
670 h = req->request->front.iov_base; 699 h = req->request->front.iov_base;
671 h->monhdr.have_version = 0; 700 h->monhdr.have_version = 0;
672 h->monhdr.session_mon = cpu_to_le16(-1); 701 h->monhdr.session_mon = cpu_to_le16(-1);
673 h->monhdr.session_mon_tid = 0; 702 h->monhdr.session_mon_tid = 0;
674 h->fsid = monc->monmap->fsid; 703 h->fsid = monc->monmap->fsid;
704 send_generic_request(monc, req);
705 mutex_unlock(&monc->mutex);
675 706
676 err = do_generic_request(monc, req); 707 ret = wait_generic_request(req);
677
678out: 708out:
679 put_generic_request(req); 709 put_generic_request(req);
680 return err; 710 return ret;
681} 711}
682EXPORT_SYMBOL(ceph_monc_do_statfs); 712EXPORT_SYMBOL(ceph_monc_do_statfs);
683 713
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
690 void *end = p + msg->front_alloc_len; 720 void *end = p + msg->front_alloc_len;
691 u64 handle; 721 u64 handle;
692 722
693 dout("%s %p tid %llu\n", __func__, msg, tid); 723 dout("%s msg %p tid %llu\n", __func__, msg, tid);
694 724
695 ceph_decode_need(&p, end, 2*sizeof(u64), bad); 725 ceph_decode_need(&p, end, 2*sizeof(u64), bad);
696 handle = ceph_decode_64(&p); 726 handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
698 goto bad; 728 goto bad;
699 729
700 mutex_lock(&monc->mutex); 730 mutex_lock(&monc->mutex);
701 req = __lookup_generic_req(monc, handle); 731 req = lookup_generic_request(&monc->generic_request_tree, handle);
702 if (req) { 732 if (!req) {
703 *(u64 *)req->buf = ceph_decode_64(&p); 733 mutex_unlock(&monc->mutex);
704 req->result = 0; 734 return;
705 get_generic_request(req);
706 } 735 }
736
737 req->result = 0;
738 req->u.newest = ceph_decode_64(&p);
739 __finish_generic_request(req);
707 mutex_unlock(&monc->mutex); 740 mutex_unlock(&monc->mutex);
708 if (req) {
709 complete_all(&req->completion);
710 put_generic_request(req);
711 }
712 741
742 complete_generic_request(req);
713 return; 743 return;
744
714bad: 745bad:
715 pr_err("corrupt mon_get_version reply, tid %llu\n", tid); 746 pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
716 ceph_msg_dump(msg); 747 ceph_msg_dump(msg);
717} 748}
718 749
719/* 750static struct ceph_mon_generic_request *
720 * Send MMonGetVersion and wait for the reply. 751__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
721 * 752 ceph_monc_callback_t cb, u64 private_data)
722 * @what: one of "mdsmap", "osdmap" or "monmap"
723 */
724int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
725 u64 *newest)
726{ 753{
727 struct ceph_mon_generic_request *req; 754 struct ceph_mon_generic_request *req;
728 void *p, *end;
729 u64 tid;
730 int err;
731 755
732 req = kzalloc(sizeof(*req), GFP_NOFS); 756 req = alloc_generic_request(monc, GFP_NOIO);
733 if (!req) 757 if (!req)
734 return -ENOMEM; 758 goto err_put_req;
735
736 kref_init(&req->kref);
737 req->buf = newest;
738 init_completion(&req->completion);
739 759
740 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, 760 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
741 sizeof(u64) + sizeof(u32) + strlen(what), 761 sizeof(u64) + sizeof(u32) + strlen(what),
742 GFP_NOFS, true); 762 GFP_NOIO, true);
743 if (!req->request) { 763 if (!req->request)
744 err = -ENOMEM; 764 goto err_put_req;
745 goto out;
746 }
747 765
748 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, 766 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
749 GFP_NOFS, true); 767 true);
750 if (!req->reply) { 768 if (!req->reply)
751 err = -ENOMEM; 769 goto err_put_req;
752 goto out;
753 }
754 770
755 p = req->request->front.iov_base; 771 req->complete_cb = cb;
756 end = p + req->request->front_alloc_len; 772 req->private_data = private_data;
757 773
758 /* fill out request */
759 mutex_lock(&monc->mutex); 774 mutex_lock(&monc->mutex);
760 tid = ++monc->last_tid; 775 register_generic_request(req);
761 ceph_encode_64(&p, tid); /* handle */ 776 {
762 ceph_encode_string(&p, end, what, strlen(what)); 777 void *p = req->request->front.iov_base;
778 void *const end = p + req->request->front_alloc_len;
779
780 ceph_encode_64(&p, req->tid); /* handle */
781 ceph_encode_string(&p, end, what, strlen(what));
782 WARN_ON(p != end);
783 }
784 send_generic_request(monc, req);
785 mutex_unlock(&monc->mutex);
763 786
764 err = __do_generic_request(monc, tid, req); 787 return req;
765 788
766 mutex_unlock(&monc->mutex); 789err_put_req:
767out:
768 put_generic_request(req); 790 put_generic_request(req);
769 return err; 791 return ERR_PTR(-ENOMEM);
792}
793
794/*
795 * Send MMonGetVersion and wait for the reply.
796 *
797 * @what: one of "mdsmap", "osdmap" or "monmap"
798 */
799int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
800 u64 *newest)
801{
802 struct ceph_mon_generic_request *req;
803 int ret;
804
805 req = __ceph_monc_get_version(monc, what, NULL, 0);
806 if (IS_ERR(req))
807 return PTR_ERR(req);
808
809 ret = wait_generic_request(req);
810 if (!ret)
811 *newest = req->u.newest;
812
813 put_generic_request(req);
814 return ret;
770} 815}
771EXPORT_SYMBOL(ceph_monc_do_get_version); 816EXPORT_SYMBOL(ceph_monc_get_version);
817
818/*
819 * Send MMonGetVersion,
820 *
821 * @what: one of "mdsmap", "osdmap" or "monmap"
822 */
823int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
824 ceph_monc_callback_t cb, u64 private_data)
825{
826 struct ceph_mon_generic_request *req;
827
828 req = __ceph_monc_get_version(monc, what, cb, private_data);
829 if (IS_ERR(req))
830 return PTR_ERR(req);
831
832 put_generic_request(req);
833 return 0;
834}
835EXPORT_SYMBOL(ceph_monc_get_version_async);
772 836
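A hypothetical caller of the async variant, assuming (per the mon_client.h changes in this patch) that ceph_monc_callback_t receives the completed generic request, with the outcome in req->result and the version in req->u.newest:

    static void have_osdmap_version(struct ceph_mon_generic_request *req)
    {
            if (!req->result)
                    pr_info("osdmap version %llu\n", req->u.newest);
            /* req->private_data carries the u64 passed in below */
    }

    ret = ceph_monc_get_version_async(monc, "osdmap", have_osdmap_version, 0);

The request self-registers and is put immediately, so the callback is the only place the result can be observed.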
773/* 837/*
774 * Resend pending generic requests. 838 * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
890 if (!monc->m_subscribe_ack) 954 if (!monc->m_subscribe_ack)
891 goto out_auth; 955 goto out_auth;
892 956
893 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 957 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
894 true); 958 true);
895 if (!monc->m_subscribe) 959 if (!monc->m_subscribe)
896 goto out_subscribe_ack; 960 goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
914 978
915 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 979 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
916 monc->generic_request_tree = RB_ROOT; 980 monc->generic_request_tree = RB_ROOT;
917 monc->num_generic_requests = 0;
918 monc->last_tid = 0; 981 monc->last_tid = 0;
919 982
983 monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
984
920 return 0; 985 return 0;
921 986
922out_auth_reply: 987out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
954 1019
955 ceph_auth_destroy(monc->auth); 1020 ceph_auth_destroy(monc->auth);
956 1021
1022 WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
1023
957 ceph_msg_put(monc->m_auth); 1024 ceph_msg_put(monc->m_auth);
958 ceph_msg_put(monc->m_auth_reply); 1025 ceph_msg_put(monc->m_auth_reply);
959 ceph_msg_put(monc->m_subscribe); 1026 ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..0160d7d09a1e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
19#include <linux/ceph/auth.h> 19#include <linux/ceph/auth.h>
20#include <linux/ceph/pagelist.h> 20#include <linux/ceph/pagelist.h>
21 21
22#define OSD_OP_FRONT_LEN 4096
23#define OSD_OPREPLY_FRONT_LEN 512 22#define OSD_OPREPLY_FRONT_LEN 512
24 23
25static struct kmem_cache *ceph_osd_request_cache; 24static struct kmem_cache *ceph_osd_request_cache;
26 25
27static const struct ceph_connection_operations osd_con_ops; 26static const struct ceph_connection_operations osd_con_ops;
28 27
29static void __send_queued(struct ceph_osd_client *osdc);
30static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
31static void __register_request(struct ceph_osd_client *osdc,
32 struct ceph_osd_request *req);
33static void __unregister_request(struct ceph_osd_client *osdc,
34 struct ceph_osd_request *req);
35static void __unregister_linger_request(struct ceph_osd_client *osdc,
36 struct ceph_osd_request *req);
37static void __enqueue_request(struct ceph_osd_request *req);
38static void __send_request(struct ceph_osd_client *osdc,
39 struct ceph_osd_request *req);
40
41/* 28/*
42 * Implement client access to distributed object storage cluster. 29 * Implement client access to distributed object storage cluster.
43 * 30 *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
56 * channel with an OSD is reset. 43 * channel with an OSD is reset.
57 */ 44 */
58 45
46static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
47static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
48static void link_linger(struct ceph_osd *osd,
49 struct ceph_osd_linger_request *lreq);
50static void unlink_linger(struct ceph_osd *osd,
51 struct ceph_osd_linger_request *lreq);
52
53#if 1
54static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
55{
56 bool wrlocked = true;
57
58 if (unlikely(down_read_trylock(sem))) {
59 wrlocked = false;
60 up_read(sem);
61 }
62
63 return wrlocked;
64}
65static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
66{
67 WARN_ON(!rwsem_is_locked(&osdc->lock));
68}
69static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
70{
71 WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
72}
73static inline void verify_osd_locked(struct ceph_osd *osd)
74{
75 struct ceph_osd_client *osdc = osd->o_osdc;
76
77 WARN_ON(!(mutex_is_locked(&osd->lock) &&
78 rwsem_is_locked(&osdc->lock)) &&
79 !rwsem_is_wrlocked(&osdc->lock));
80}
81static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
82{
83 WARN_ON(!mutex_is_locked(&lreq->lock));
84}
85#else
86static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
87static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
88static inline void verify_osd_locked(struct ceph_osd *osd) { }
89static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
90#endif
91
59/* 92/*
60 * calculate the mapping of a file extent onto an object, and fill out the 93 * calculate the mapping of a file extent onto an object, and fill out the
61 * request accordingly. shorten extent as necessary if it crosses an 94 * request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
144} 177}
145EXPORT_SYMBOL(osd_req_op_extent_osd_data); 178EXPORT_SYMBOL(osd_req_op_extent_osd_data);
146 179
147struct ceph_osd_data *
148osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
149 unsigned int which)
150{
151 return osd_req_op_data(osd_req, which, cls, response_data);
152}
153EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
154
155void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, 180void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
156 unsigned int which, struct page **pages, 181 unsigned int which, struct page **pages,
157 u64 length, u32 alignment, 182 u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
218 243
219 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 244 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
220 ceph_osd_data_pagelist_init(osd_data, pagelist); 245 ceph_osd_data_pagelist_init(osd_data, pagelist);
246 osd_req->r_ops[which].cls.indata_len += pagelist->length;
247 osd_req->r_ops[which].indata_len += pagelist->length;
221} 248}
222EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); 249EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
223 250
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
230 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 257 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
231 ceph_osd_data_pages_init(osd_data, pages, length, alignment, 258 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
232 pages_from_pool, own_pages); 259 pages_from_pool, own_pages);
260 osd_req->r_ops[which].cls.indata_len += length;
261 osd_req->r_ops[which].indata_len += length;
233} 262}
234EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); 263EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
235 264
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
302 case CEPH_OSD_OP_STAT: 331 case CEPH_OSD_OP_STAT:
303 ceph_osd_data_release(&op->raw_data_in); 332 ceph_osd_data_release(&op->raw_data_in);
304 break; 333 break;
334 case CEPH_OSD_OP_NOTIFY_ACK:
335 ceph_osd_data_release(&op->notify_ack.request_data);
336 break;
337 case CEPH_OSD_OP_NOTIFY:
338 ceph_osd_data_release(&op->notify.request_data);
339 ceph_osd_data_release(&op->notify.response_data);
340 break;
305 default: 341 default:
306 break; 342 break;
307 } 343 }
308} 344}
309 345
310/* 346/*
347 * Assumes @t is zero-initialized.
348 */
349static void target_init(struct ceph_osd_request_target *t)
350{
351 ceph_oid_init(&t->base_oid);
352 ceph_oloc_init(&t->base_oloc);
353 ceph_oid_init(&t->target_oid);
354 ceph_oloc_init(&t->target_oloc);
355
356 ceph_osds_init(&t->acting);
357 ceph_osds_init(&t->up);
358 t->size = -1;
359 t->min_size = -1;
360
361 t->osd = CEPH_HOMELESS_OSD;
362}
363
364static void target_copy(struct ceph_osd_request_target *dest,
365 const struct ceph_osd_request_target *src)
366{
367 ceph_oid_copy(&dest->base_oid, &src->base_oid);
368 ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
369 ceph_oid_copy(&dest->target_oid, &src->target_oid);
370 ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
371
372 dest->pgid = src->pgid; /* struct */
373 dest->pg_num = src->pg_num;
374 dest->pg_num_mask = src->pg_num_mask;
375 ceph_osds_copy(&dest->acting, &src->acting);
376 ceph_osds_copy(&dest->up, &src->up);
377 dest->size = src->size;
378 dest->min_size = src->min_size;
379 dest->sort_bitwise = src->sort_bitwise;
380
381 dest->flags = src->flags;
382 dest->paused = src->paused;
383
384 dest->osd = src->osd;
385}
386
387static void target_destroy(struct ceph_osd_request_target *t)
388{
389 ceph_oid_destroy(&t->base_oid);
390 ceph_oid_destroy(&t->target_oid);
391}
392
393/*
311 * requests 394 * requests
312 */ 395 */
396static void request_release_checks(struct ceph_osd_request *req)
397{
398 WARN_ON(!RB_EMPTY_NODE(&req->r_node));
399 WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
400 WARN_ON(!list_empty(&req->r_unsafe_item));
401 WARN_ON(req->r_osd);
402}
403
313static void ceph_osdc_release_request(struct kref *kref) 404static void ceph_osdc_release_request(struct kref *kref)
314{ 405{
315 struct ceph_osd_request *req = container_of(kref, 406 struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
318 409
319 dout("%s %p (r_request %p r_reply %p)\n", __func__, req, 410 dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
320 req->r_request, req->r_reply); 411 req->r_request, req->r_reply);
321 WARN_ON(!RB_EMPTY_NODE(&req->r_node)); 412 request_release_checks(req);
322 WARN_ON(!list_empty(&req->r_req_lru_item));
323 WARN_ON(!list_empty(&req->r_osd_item));
324 WARN_ON(!list_empty(&req->r_linger_item));
325 WARN_ON(!list_empty(&req->r_linger_osd_item));
326 WARN_ON(req->r_osd);
327 413
328 if (req->r_request) 414 if (req->r_request)
329 ceph_msg_put(req->r_request); 415 ceph_msg_put(req->r_request);
330 if (req->r_reply) { 416 if (req->r_reply)
331 ceph_msg_revoke_incoming(req->r_reply);
332 ceph_msg_put(req->r_reply); 417 ceph_msg_put(req->r_reply);
333 }
334 418
335 for (which = 0; which < req->r_num_ops; which++) 419 for (which = 0; which < req->r_num_ops; which++)
336 osd_req_op_data_release(req, which); 420 osd_req_op_data_release(req, which);
337 421
422 target_destroy(&req->r_t);
338 ceph_put_snap_context(req->r_snapc); 423 ceph_put_snap_context(req->r_snapc);
424
339 if (req->r_mempool) 425 if (req->r_mempool)
340 mempool_free(req, req->r_osdc->req_mempool); 426 mempool_free(req, req->r_osdc->req_mempool);
341 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) 427 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
354 440
355void ceph_osdc_put_request(struct ceph_osd_request *req) 441void ceph_osdc_put_request(struct ceph_osd_request *req)
356{ 442{
357 dout("%s %p (was %d)\n", __func__, req, 443 if (req) {
358 atomic_read(&req->r_kref.refcount)); 444 dout("%s %p (was %d)\n", __func__, req,
359 kref_put(&req->r_kref, ceph_osdc_release_request); 445 atomic_read(&req->r_kref.refcount));
446 kref_put(&req->r_kref, ceph_osdc_release_request);
447 }
360} 448}
361EXPORT_SYMBOL(ceph_osdc_put_request); 449EXPORT_SYMBOL(ceph_osdc_put_request);
362 450
451static void request_init(struct ceph_osd_request *req)
452{
453 /* req only, each op is zeroed in _osd_req_op_init() */
454 memset(req, 0, sizeof(*req));
455
456 kref_init(&req->r_kref);
457 init_completion(&req->r_completion);
458 init_completion(&req->r_safe_completion);
459 RB_CLEAR_NODE(&req->r_node);
460 RB_CLEAR_NODE(&req->r_mc_node);
461 INIT_LIST_HEAD(&req->r_unsafe_item);
462
463 target_init(&req->r_t);
464}
465
466/*
467 * This is ugly, but it allows us to reuse linger registration and ping
468 * requests, keeping the structure of the code around send_linger{_ping}()
469 * reasonable. Setting up a min_nr=2 mempool for each linger request
470 * and dealing with copying ops (this blasts req only, watch op remains
471 * intact) isn't any better.
472 */
473static void request_reinit(struct ceph_osd_request *req)
474{
475 struct ceph_osd_client *osdc = req->r_osdc;
476 bool mempool = req->r_mempool;
477 unsigned int num_ops = req->r_num_ops;
478 u64 snapid = req->r_snapid;
479 struct ceph_snap_context *snapc = req->r_snapc;
480 bool linger = req->r_linger;
481 struct ceph_msg *request_msg = req->r_request;
482 struct ceph_msg *reply_msg = req->r_reply;
483
484 dout("%s req %p\n", __func__, req);
485 WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
486 request_release_checks(req);
487
488 WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
489 WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
490 target_destroy(&req->r_t);
491
492 request_init(req);
493 req->r_osdc = osdc;
494 req->r_mempool = mempool;
495 req->r_num_ops = num_ops;
496 req->r_snapid = snapid;
497 req->r_snapc = snapc;
498 req->r_linger = linger;
499 req->r_request = request_msg;
500 req->r_reply = reply_msg;
501}
502
363struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 503struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
364 struct ceph_snap_context *snapc, 504 struct ceph_snap_context *snapc,
365 unsigned int num_ops, 505 unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
367 gfp_t gfp_flags) 507 gfp_t gfp_flags)
368{ 508{
369 struct ceph_osd_request *req; 509 struct ceph_osd_request *req;
370 struct ceph_msg *msg;
371 size_t msg_size;
372 510
373 if (use_mempool) { 511 if (use_mempool) {
374 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); 512 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
383 if (unlikely(!req)) 521 if (unlikely(!req))
384 return NULL; 522 return NULL;
385 523
386 /* req only, each op is zeroed in _osd_req_op_init() */ 524 request_init(req);
387 memset(req, 0, sizeof(*req));
388
389 req->r_osdc = osdc; 525 req->r_osdc = osdc;
390 req->r_mempool = use_mempool; 526 req->r_mempool = use_mempool;
391 req->r_num_ops = num_ops; 527 req->r_num_ops = num_ops;
528 req->r_snapid = CEPH_NOSNAP;
529 req->r_snapc = ceph_get_snap_context(snapc);
392 530
393 kref_init(&req->r_kref); 531 dout("%s req %p\n", __func__, req);
394 init_completion(&req->r_completion); 532 return req;
395 init_completion(&req->r_safe_completion); 533}
396 RB_CLEAR_NODE(&req->r_node); 534EXPORT_SYMBOL(ceph_osdc_alloc_request);
397 INIT_LIST_HEAD(&req->r_unsafe_item);
398 INIT_LIST_HEAD(&req->r_linger_item);
399 INIT_LIST_HEAD(&req->r_linger_osd_item);
400 INIT_LIST_HEAD(&req->r_req_lru_item);
401 INIT_LIST_HEAD(&req->r_osd_item);
402
403 req->r_base_oloc.pool = -1;
404 req->r_target_oloc.pool = -1;
405 535
406 msg_size = OSD_OPREPLY_FRONT_LEN; 536int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
407 if (num_ops > CEPH_OSD_SLAB_OPS) { 537{
408 /* ceph_osd_op and rval */ 538 struct ceph_osd_client *osdc = req->r_osdc;
409 msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * 539 struct ceph_msg *msg;
410 (sizeof(struct ceph_osd_op) + 4); 540 int msg_size;
411 }
412 541
413 /* create reply message */ 542 WARN_ON(ceph_oid_empty(&req->r_base_oid));
414 if (use_mempool)
415 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
416 else
417 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
418 gfp_flags, true);
419 if (!msg) {
420 ceph_osdc_put_request(req);
421 return NULL;
422 }
423 req->r_reply = msg;
424 543
544 /* create request message */
425 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 545 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
426 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 546 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
427 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 547 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
428 msg_size += 1 + 8 + 4 + 4; /* pgid */ 548 msg_size += 1 + 8 + 4 + 4; /* pgid */
429 msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 549 msg_size += 4 + req->r_base_oid.name_len; /* oid */
430 msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); 550 msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
431 msg_size += 8; /* snapid */ 551 msg_size += 8; /* snapid */
432 msg_size += 8; /* snap_seq */ 552 msg_size += 8; /* snap_seq */
433 msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 553 msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
434 msg_size += 4; /* retry_attempt */ 554 msg_size += 4; /* retry_attempt */
435 555
436 /* create request message; allow space for oid */ 556 if (req->r_mempool)
437 if (use_mempool)
438 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 557 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
439 else 558 else
440 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 559 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
441 if (!msg) { 560 if (!msg)
442 ceph_osdc_put_request(req); 561 return -ENOMEM;
443 return NULL;
444 }
445 562
446 memset(msg->front.iov_base, 0, msg->front.iov_len); 563 memset(msg->front.iov_base, 0, msg->front.iov_len);
447
448 req->r_request = msg; 564 req->r_request = msg;
449 565
450 return req; 566 /* create reply message */
567 msg_size = OSD_OPREPLY_FRONT_LEN;
568 msg_size += req->r_base_oid.name_len;
569 msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
570
571 if (req->r_mempool)
572 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
573 else
574 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
575 if (!msg)
576 return -ENOMEM;
577
578 req->r_reply = msg;
579
580 return 0;
451} 581}
452EXPORT_SYMBOL(ceph_osdc_alloc_request); 582EXPORT_SYMBOL(ceph_osdc_alloc_messages);
453 583
454static bool osd_req_opcode_valid(u16 opcode) 584static bool osd_req_opcode_valid(u16 opcode)
455{ 585{
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
587 717
588 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 718 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
589 719
590 op->cls.argc = 0; /* currently unused */
591
592 op->indata_len = payload_len; 720 op->indata_len = payload_len;
593} 721}
594EXPORT_SYMBOL(osd_req_op_cls_init); 722EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
627} 755}
628EXPORT_SYMBOL(osd_req_op_xattr_init); 756EXPORT_SYMBOL(osd_req_op_xattr_init);
629 757
630void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 758/*
631 unsigned int which, u16 opcode, 759 * @watch_opcode: CEPH_OSD_WATCH_OP_*
632 u64 cookie, u64 version, int flag) 760 */
761static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
762 u64 cookie, u8 watch_opcode)
633{ 763{
634 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, 764 struct ceph_osd_req_op *op;
635 opcode, 0);
636
637 BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
638 765
766 op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
639 op->watch.cookie = cookie; 767 op->watch.cookie = cookie;
640 op->watch.ver = version; 768 op->watch.op = watch_opcode;
641 if (opcode == CEPH_OSD_OP_WATCH && flag) 769 op->watch.gen = 0;
642 op->watch.flag = (u8)1;
643} 770}
644EXPORT_SYMBOL(osd_req_op_watch_init);
645 771
646void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 772void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
647 unsigned int which, 773 unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
686 } 812 }
687} 813}
688 814
689static u64 osd_req_encode_op(struct ceph_osd_request *req, 815static u32 osd_req_encode_op(struct ceph_osd_op *dst,
690 struct ceph_osd_op *dst, unsigned int which) 816 const struct ceph_osd_req_op *src)
691{ 817{
692 struct ceph_osd_req_op *src;
693 struct ceph_osd_data *osd_data;
694 u64 request_data_len = 0;
695 u64 data_length;
696
697 BUG_ON(which >= req->r_num_ops);
698 src = &req->r_ops[which];
699 if (WARN_ON(!osd_req_opcode_valid(src->op))) { 818 if (WARN_ON(!osd_req_opcode_valid(src->op))) {
700 pr_err("unrecognized osd opcode %d\n", src->op); 819 pr_err("unrecognized osd opcode %d\n", src->op);
701 820
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
704 823
705 switch (src->op) { 824 switch (src->op) {
706 case CEPH_OSD_OP_STAT: 825 case CEPH_OSD_OP_STAT:
707 osd_data = &src->raw_data_in;
708 ceph_osdc_msg_data_add(req->r_reply, osd_data);
709 break; 826 break;
710 case CEPH_OSD_OP_READ: 827 case CEPH_OSD_OP_READ:
711 case CEPH_OSD_OP_WRITE: 828 case CEPH_OSD_OP_WRITE:
712 case CEPH_OSD_OP_WRITEFULL: 829 case CEPH_OSD_OP_WRITEFULL:
713 case CEPH_OSD_OP_ZERO: 830 case CEPH_OSD_OP_ZERO:
714 case CEPH_OSD_OP_TRUNCATE: 831 case CEPH_OSD_OP_TRUNCATE:
715 if (src->op == CEPH_OSD_OP_WRITE ||
716 src->op == CEPH_OSD_OP_WRITEFULL)
717 request_data_len = src->extent.length;
718 dst->extent.offset = cpu_to_le64(src->extent.offset); 832 dst->extent.offset = cpu_to_le64(src->extent.offset);
719 dst->extent.length = cpu_to_le64(src->extent.length); 833 dst->extent.length = cpu_to_le64(src->extent.length);
720 dst->extent.truncate_size = 834 dst->extent.truncate_size =
721 cpu_to_le64(src->extent.truncate_size); 835 cpu_to_le64(src->extent.truncate_size);
722 dst->extent.truncate_seq = 836 dst->extent.truncate_seq =
723 cpu_to_le32(src->extent.truncate_seq); 837 cpu_to_le32(src->extent.truncate_seq);
724 osd_data = &src->extent.osd_data;
725 if (src->op == CEPH_OSD_OP_WRITE ||
726 src->op == CEPH_OSD_OP_WRITEFULL)
727 ceph_osdc_msg_data_add(req->r_request, osd_data);
728 else
729 ceph_osdc_msg_data_add(req->r_reply, osd_data);
730 break; 838 break;
731 case CEPH_OSD_OP_CALL: 839 case CEPH_OSD_OP_CALL:
732 dst->cls.class_len = src->cls.class_len; 840 dst->cls.class_len = src->cls.class_len;
733 dst->cls.method_len = src->cls.method_len; 841 dst->cls.method_len = src->cls.method_len;
734 osd_data = &src->cls.request_info; 842 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
735 ceph_osdc_msg_data_add(req->r_request, osd_data);
736 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
737 request_data_len = osd_data->pagelist->length;
738
739 osd_data = &src->cls.request_data;
740 data_length = ceph_osd_data_length(osd_data);
741 if (data_length) {
742 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
743 dst->cls.indata_len = cpu_to_le32(data_length);
744 ceph_osdc_msg_data_add(req->r_request, osd_data);
745 src->indata_len += data_length;
746 request_data_len += data_length;
747 }
748 osd_data = &src->cls.response_data;
749 ceph_osdc_msg_data_add(req->r_reply, osd_data);
750 break; 843 break;
751 case CEPH_OSD_OP_STARTSYNC: 844 case CEPH_OSD_OP_STARTSYNC:
752 break; 845 break;
753 case CEPH_OSD_OP_NOTIFY_ACK:
754 case CEPH_OSD_OP_WATCH: 846 case CEPH_OSD_OP_WATCH:
755 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 847 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
756 dst->watch.ver = cpu_to_le64(src->watch.ver); 848 dst->watch.ver = cpu_to_le64(0);
757 dst->watch.flag = src->watch.flag; 849 dst->watch.op = src->watch.op;
850 dst->watch.gen = cpu_to_le32(src->watch.gen);
851 break;
852 case CEPH_OSD_OP_NOTIFY_ACK:
853 break;
854 case CEPH_OSD_OP_NOTIFY:
855 dst->notify.cookie = cpu_to_le64(src->notify.cookie);
758 break; 856 break;
759 case CEPH_OSD_OP_SETALLOCHINT: 857 case CEPH_OSD_OP_SETALLOCHINT:
760 dst->alloc_hint.expected_object_size = 858 dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
768 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 866 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
769 dst->xattr.cmp_op = src->xattr.cmp_op; 867 dst->xattr.cmp_op = src->xattr.cmp_op;
770 dst->xattr.cmp_mode = src->xattr.cmp_mode; 868 dst->xattr.cmp_mode = src->xattr.cmp_mode;
771 osd_data = &src->xattr.osd_data;
772 ceph_osdc_msg_data_add(req->r_request, osd_data);
773 request_data_len = osd_data->pagelist->length;
774 break; 869 break;
775 case CEPH_OSD_OP_CREATE: 870 case CEPH_OSD_OP_CREATE:
776 case CEPH_OSD_OP_DELETE: 871 case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
787 dst->flags = cpu_to_le32(src->flags); 882 dst->flags = cpu_to_le32(src->flags);
788 dst->payload_len = cpu_to_le32(src->indata_len); 883 dst->payload_len = cpu_to_le32(src->indata_len);
789 884
790 return request_data_len; 885 return src->indata_len;
791} 886}
792 887
793/* 888/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
824 919
825 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, 920 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
826 GFP_NOFS); 921 GFP_NOFS);
827 if (!req) 922 if (!req) {
828 return ERR_PTR(-ENOMEM); 923 r = -ENOMEM;
829 924 goto fail;
830 req->r_flags = flags; 925 }
831 926
832 /* calculate max write size */ 927 /* calculate max write size */
833 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); 928 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
834 if (r < 0) { 929 if (r)
835 ceph_osdc_put_request(req); 930 goto fail;
836 return ERR_PTR(r);
837 }
838 931
839 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { 932 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
840 osd_req_op_init(req, which, opcode, 0); 933 osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
854 truncate_size, truncate_seq); 947 truncate_size, truncate_seq);
855 } 948 }
856 949
950 req->r_flags = flags;
857 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 951 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
952 ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
858 953
859 snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), 954 req->r_snapid = vino.snap;
860 "%llx.%08llx", vino.ino, objnum); 955 if (flags & CEPH_OSD_FLAG_WRITE)
861 req->r_base_oid.name_len = strlen(req->r_base_oid.name); 956 req->r_data_offset = off;
957
958 r = ceph_osdc_alloc_messages(req, GFP_NOFS);
959 if (r)
960 goto fail;
862 961
863 return req; 962 return req;
963
964fail:
965 ceph_osdc_put_request(req);
966 return ERR_PTR(r);
864} 967}
865EXPORT_SYMBOL(ceph_osdc_new_request); 968EXPORT_SYMBOL(ceph_osdc_new_request);
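
A hedged sketch of how a filesystem caller typically drives this constructor (modeled on the fs/ceph read path; ci, vino, off and count stand in for per-inode state and are assumptions here):

	u64 len = count;	/* clamped to one object by the call */
	struct ceph_osd_request *req;

	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
				    0, 1, CEPH_OSD_OP_READ,
				    CEPH_OSD_FLAG_READ, NULL,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);	/* now covers message allocation too */
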
866 969
867/* 970/*
868 * We keep osd requests in an rbtree, sorted by ->r_tid. 971 * We keep osd requests in an rbtree, sorted by ->r_tid.
869 */ 972 */
870static void __insert_request(struct ceph_osd_client *osdc, 973DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
871 struct ceph_osd_request *new) 974DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
872{
873 struct rb_node **p = &osdc->requests.rb_node;
874 struct rb_node *parent = NULL;
875 struct ceph_osd_request *req = NULL;
876
877 while (*p) {
878 parent = *p;
879 req = rb_entry(parent, struct ceph_osd_request, r_node);
880 if (new->r_tid < req->r_tid)
881 p = &(*p)->rb_left;
882 else if (new->r_tid > req->r_tid)
883 p = &(*p)->rb_right;
884 else
885 BUG();
886 }
887
888 rb_link_node(&new->r_node, parent, p);
889 rb_insert_color(&new->r_node, &osdc->requests);
890}
891
892static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
893 u64 tid)
894{
895 struct ceph_osd_request *req;
896 struct rb_node *n = osdc->requests.rb_node;
897
898 while (n) {
899 req = rb_entry(n, struct ceph_osd_request, r_node);
900 if (tid < req->r_tid)
901 n = n->rb_left;
902 else if (tid > req->r_tid)
903 n = n->rb_right;
904 else
905 return req;
906 }
907 return NULL;
908}
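
DEFINE_RB_FUNCS(request, ...) above replaces exactly the kind of open-coded tid walk being deleted here: it presumably expands to insert_request()/lookup_request()/erase_request() helpers generated from the key and node fields. A sketch of the generated insert, not the macro's verbatim expansion:

	static void insert_request(struct rb_root *root,
				   struct ceph_osd_request *req)
	{
		struct rb_node **n = &root->rb_node;
		struct rb_node *parent = NULL;

		while (*n) {
			struct ceph_osd_request *cur =
			    rb_entry(*n, struct ceph_osd_request, r_node);

			parent = *n;
			if (req->r_tid < cur->r_tid)
				n = &(*n)->rb_left;
			else if (req->r_tid > cur->r_tid)
				n = &(*n)->rb_right;
			else
				BUG();	/* tids are unique */
		}

		rb_link_node(&req->r_node, parent, n);
		rb_insert_color(&req->r_node, root);
	}
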
909 975
910static struct ceph_osd_request * 976static bool osd_homeless(struct ceph_osd *osd)
911__lookup_request_ge(struct ceph_osd_client *osdc,
912 u64 tid)
913{ 977{
914 struct ceph_osd_request *req; 978 return osd->o_osd == CEPH_HOMELESS_OSD;
915 struct rb_node *n = osdc->requests.rb_node;
916
917 while (n) {
918 req = rb_entry(n, struct ceph_osd_request, r_node);
919 if (tid < req->r_tid) {
920 if (!n->rb_left)
921 return req;
922 n = n->rb_left;
923 } else if (tid > req->r_tid) {
924 n = n->rb_right;
925 } else {
926 return req;
927 }
928 }
929 return NULL;
930} 979}
931 980
932static void __kick_linger_request(struct ceph_osd_request *req) 981static bool osd_registered(struct ceph_osd *osd)
933{ 982{
934 struct ceph_osd_client *osdc = req->r_osdc; 983 verify_osdc_locked(osd->o_osdc);
935 struct ceph_osd *osd = req->r_osd;
936
937 /*
938 * Linger requests need to be resent with a new tid to avoid
939 * the dup op detection logic on the OSDs. Achieve this with
940 * a re-register dance instead of open-coding.
941 */
942 ceph_osdc_get_request(req);
943 if (!list_empty(&req->r_linger_item))
944 __unregister_linger_request(osdc, req);
945 else
946 __unregister_request(osdc, req);
947 __register_request(osdc, req);
948 ceph_osdc_put_request(req);
949
950 /*
951 * Unless request has been registered as both normal and
952 * lingering, __unregister{,_linger}_request clears r_osd.
953 * However, here we need to preserve r_osd to make sure we
954 * requeue on the same OSD.
955 */
956 WARN_ON(req->r_osd || !osd);
957 req->r_osd = osd;
958 984
959 dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); 985 return !RB_EMPTY_NODE(&osd->o_node);
960 __enqueue_request(req);
961} 986}
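
osd_registered() relies on the RB_CLEAR_NODE()/RB_EMPTY_NODE() idiom: a cleared rb_node points at itself, so tree membership is an O(1) test rather than a lookup. In outline (insert_osd() is generated by DEFINE_RB_FUNCS(osd, ...) further down):

	RB_CLEAR_NODE(&osd->o_node);		/* marked "in no tree" */
	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));	/* not registered yet */
	insert_osd(&osdc->osds, osd);		/* links o_node */
	WARN_ON(RB_EMPTY_NODE(&osd->o_node));	/* now registered */
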
962 987
963/* 988/*
964 * Resubmit requests pending on the given osd. 989 * Assumes @osd is zero-initialized.
965 */ 990 */
966static void __kick_osd_requests(struct ceph_osd_client *osdc, 991static void osd_init(struct ceph_osd *osd)
967 struct ceph_osd *osd)
968{ 992{
969 struct ceph_osd_request *req, *nreq; 993 atomic_set(&osd->o_ref, 1);
970 LIST_HEAD(resend); 994 RB_CLEAR_NODE(&osd->o_node);
971 LIST_HEAD(resend_linger); 995 osd->o_requests = RB_ROOT;
972 int err; 996 osd->o_linger_requests = RB_ROOT;
973 997 INIT_LIST_HEAD(&osd->o_osd_lru);
974 dout("%s osd%d\n", __func__, osd->o_osd); 998 INIT_LIST_HEAD(&osd->o_keepalive_item);
975 err = __reset_osd(osdc, osd); 999 osd->o_incarnation = 1;
976 if (err) 1000 mutex_init(&osd->lock);
977 return;
978
979 /*
980 * Build up a list of requests to resend by traversing the
981 * osd's list of requests. Requests for a given object are
982 * sent in tid order, and that is also the order they're
983 * kept on this list. Therefore all requests that are in
984 * flight will be found first, followed by all requests that
985 * have not yet been sent. And to resend requests while
986 * preserving this order we will want to put any sent
987 * requests back on the front of the osd client's unsent
988 * list.
989 *
990 * So we build a separate ordered list of already-sent
991 * requests for the affected osd and splice it onto the
992 * front of the osd client's unsent list. Once we've seen a
993 * request that has not yet been sent we're done. Those
994 * requests are already sitting right where they belong.
995 */
996 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
997 if (!req->r_sent)
998 break;
999
1000 if (!req->r_linger) {
1001 dout("%s requeueing %p tid %llu\n", __func__, req,
1002 req->r_tid);
1003 list_move_tail(&req->r_req_lru_item, &resend);
1004 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1005 } else {
1006 list_move_tail(&req->r_req_lru_item, &resend_linger);
1007 }
1008 }
1009 list_splice(&resend, &osdc->req_unsent);
1010
1011 /*
1012 * Both registered and not yet registered linger requests are
1013 * enqueued with a new tid on the same OSD. We add/move them
1014 * to req_unsent/o_requests at the end to keep things in tid
1015 * order.
1016 */
1017 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
1018 r_linger_osd_item) {
1019 WARN_ON(!list_empty(&req->r_req_lru_item));
1020 __kick_linger_request(req);
1021 }
1022
1023 list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
1024 __kick_linger_request(req);
1025} 1001}
1026 1002
1027/* 1003static void osd_cleanup(struct ceph_osd *osd)
1028 * If the osd connection drops, we need to resubmit all requests.
1029 */
1030static void osd_reset(struct ceph_connection *con)
1031{ 1004{
1032 struct ceph_osd *osd = con->private; 1005 WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
1033 struct ceph_osd_client *osdc; 1006 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
1034 1007 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
1035 if (!osd) 1008 WARN_ON(!list_empty(&osd->o_osd_lru));
1036 return; 1009 WARN_ON(!list_empty(&osd->o_keepalive_item));
1037 dout("osd_reset osd%d\n", osd->o_osd); 1010
1038 osdc = osd->o_osdc; 1011 if (osd->o_auth.authorizer) {
1039 down_read(&osdc->map_sem); 1012 WARN_ON(osd_homeless(osd));
1040 mutex_lock(&osdc->request_mutex); 1013 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1041 __kick_osd_requests(osdc, osd); 1014 }
1042 __send_queued(osdc);
1043 mutex_unlock(&osdc->request_mutex);
1044 up_read(&osdc->map_sem);
1045} 1015}
1046 1016
1047/* 1017/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
1051{ 1021{
1052 struct ceph_osd *osd; 1022 struct ceph_osd *osd;
1053 1023
1054 osd = kzalloc(sizeof(*osd), GFP_NOFS); 1024 WARN_ON(onum == CEPH_HOMELESS_OSD);
1055 if (!osd)
1056 return NULL;
1057 1025
1058 atomic_set(&osd->o_ref, 1); 1026 osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
1027 osd_init(osd);
1059 osd->o_osdc = osdc; 1028 osd->o_osdc = osdc;
1060 osd->o_osd = onum; 1029 osd->o_osd = onum;
1061 RB_CLEAR_NODE(&osd->o_node);
1062 INIT_LIST_HEAD(&osd->o_requests);
1063 INIT_LIST_HEAD(&osd->o_linger_requests);
1064 INIT_LIST_HEAD(&osd->o_osd_lru);
1065 osd->o_incarnation = 1;
1066 1030
1067 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); 1031 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
1068 1032
1069 INIT_LIST_HEAD(&osd->o_keepalive_item);
1070 return osd; 1033 return osd;
1071} 1034}
1072 1035
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
1087 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 1050 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
1088 atomic_read(&osd->o_ref) - 1); 1051 atomic_read(&osd->o_ref) - 1);
1089 if (atomic_dec_and_test(&osd->o_ref)) { 1052 if (atomic_dec_and_test(&osd->o_ref)) {
1090 if (osd->o_auth.authorizer) 1053 osd_cleanup(osd);
1091 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1092 kfree(osd); 1054 kfree(osd);
1093 } 1055 }
1094} 1056}
1095 1057
1096/* 1058DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
1097 * remove an osd from our map
1098 */
1099static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1100{
1101 dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
1102 WARN_ON(!list_empty(&osd->o_requests));
1103 WARN_ON(!list_empty(&osd->o_linger_requests));
1104 1059
1105 list_del_init(&osd->o_osd_lru); 1060static void __move_osd_to_lru(struct ceph_osd *osd)
1106 rb_erase(&osd->o_node, &osdc->osds);
1107 RB_CLEAR_NODE(&osd->o_node);
1108}
1109
1110static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1111{ 1061{
1112 dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1062 struct ceph_osd_client *osdc = osd->o_osdc;
1113
1114 if (!RB_EMPTY_NODE(&osd->o_node)) {
1115 ceph_con_close(&osd->o_con);
1116 __remove_osd(osdc, osd);
1117 put_osd(osd);
1118 }
1119}
1120
1121static void remove_all_osds(struct ceph_osd_client *osdc)
1122{
1123 dout("%s %p\n", __func__, osdc);
1124 mutex_lock(&osdc->request_mutex);
1125 while (!RB_EMPTY_ROOT(&osdc->osds)) {
1126 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
1127 struct ceph_osd, o_node);
1128 remove_osd(osdc, osd);
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131}
1132 1063
1133static void __move_osd_to_lru(struct ceph_osd_client *osdc, 1064 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1134 struct ceph_osd *osd)
1135{
1136 dout("%s %p\n", __func__, osd);
1137 BUG_ON(!list_empty(&osd->o_osd_lru)); 1065 BUG_ON(!list_empty(&osd->o_osd_lru));
1138 1066
1067 spin_lock(&osdc->osd_lru_lock);
1139 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 1068 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
1069 spin_unlock(&osdc->osd_lru_lock);
1070
1140 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; 1071 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
1141} 1072}
1142 1073
1143static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, 1074static void maybe_move_osd_to_lru(struct ceph_osd *osd)
1144 struct ceph_osd *osd)
1145{ 1075{
1146 dout("%s %p\n", __func__, osd); 1076 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1147 1077 RB_EMPTY_ROOT(&osd->o_linger_requests))
1148 if (list_empty(&osd->o_requests) && 1078 __move_osd_to_lru(osd);
1149 list_empty(&osd->o_linger_requests))
1150 __move_osd_to_lru(osdc, osd);
1151} 1079}
1152 1080
1153static void __remove_osd_from_lru(struct ceph_osd *osd) 1081static void __remove_osd_from_lru(struct ceph_osd *osd)
1154{ 1082{
1155 dout("__remove_osd_from_lru %p\n", osd); 1083 struct ceph_osd_client *osdc = osd->o_osdc;
1084
1085 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1086
1087 spin_lock(&osdc->osd_lru_lock);
1156 if (!list_empty(&osd->o_osd_lru)) 1088 if (!list_empty(&osd->o_osd_lru))
1157 list_del_init(&osd->o_osd_lru); 1089 list_del_init(&osd->o_osd_lru);
1090 spin_unlock(&osdc->osd_lru_lock);
1158} 1091}
1159 1092
1160static void remove_old_osds(struct ceph_osd_client *osdc) 1093/*
1094 * Close the connection and assign any leftover requests to the
1095 * homeless session.
1096 */
1097static void close_osd(struct ceph_osd *osd)
1161{ 1098{
1162 struct ceph_osd *osd, *nosd; 1099 struct ceph_osd_client *osdc = osd->o_osdc;
1100 struct rb_node *n;
1163 1101
1164 dout("__remove_old_osds %p\n", osdc); 1102 verify_osdc_wrlocked(osdc);
1165 mutex_lock(&osdc->request_mutex); 1103 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1166 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 1104
1167 if (time_before(jiffies, osd->lru_ttl)) 1105 ceph_con_close(&osd->o_con);
1168 break; 1106
1169 remove_osd(osdc, osd); 1107 for (n = rb_first(&osd->o_requests); n; ) {
1108 struct ceph_osd_request *req =
1109 rb_entry(n, struct ceph_osd_request, r_node);
1110
1111 n = rb_next(n); /* unlink_request() */
1112
1113 dout(" reassigning req %p tid %llu\n", req, req->r_tid);
1114 unlink_request(osd, req);
1115 link_request(&osdc->homeless_osd, req);
1116 }
1117 for (n = rb_first(&osd->o_linger_requests); n; ) {
1118 struct ceph_osd_linger_request *lreq =
1119 rb_entry(n, struct ceph_osd_linger_request, node);
1120
1121 n = rb_next(n); /* unlink_linger() */
1122
1123 dout(" reassigning lreq %p linger_id %llu\n", lreq,
1124 lreq->linger_id);
1125 unlink_linger(osd, lreq);
1126 link_linger(&osdc->homeless_osd, lreq);
1170 } 1127 }
1171 mutex_unlock(&osdc->request_mutex); 1128
1129 __remove_osd_from_lru(osd);
1130 erase_osd(&osdc->osds, osd);
1131 put_osd(osd);
1172} 1132}
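
The reassignment loops above use the standard pattern for walking an rbtree whose nodes are unlinked during the walk: advance with rb_next() before erasing the current node. In isolation, with a hypothetical entry type:

	struct rb_node *n = rb_first(root);

	while (n) {
		struct item *it = rb_entry(n, struct item, node);

		n = rb_next(n);		/* still safe: it not yet erased */
		rb_erase(&it->node, root);
		RB_CLEAR_NODE(&it->node);
	}
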
1173 1133
1174/* 1134/*
1175 * reset osd connect 1135 * reset osd connect
1176 */ 1136 */
1177static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1137static int reopen_osd(struct ceph_osd *osd)
1178{ 1138{
1179 struct ceph_entity_addr *peer_addr; 1139 struct ceph_entity_addr *peer_addr;
1180 1140
1181 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 1141 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1182 if (list_empty(&osd->o_requests) && 1142
1183 list_empty(&osd->o_linger_requests)) { 1143 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1184 remove_osd(osdc, osd); 1144 RB_EMPTY_ROOT(&osd->o_linger_requests)) {
1145 close_osd(osd);
1185 return -ENODEV; 1146 return -ENODEV;
1186 } 1147 }
1187 1148
1188 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; 1149 peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
1189 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && 1150 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
1190 !ceph_con_opened(&osd->o_con)) { 1151 !ceph_con_opened(&osd->o_con)) {
1191 struct ceph_osd_request *req; 1152 struct rb_node *n;
1192 1153
1193 dout("osd addr hasn't changed and connection never opened, " 1154 dout("osd addr hasn't changed and connection never opened, "
1194 "letting msgr retry\n"); 1155 "letting msgr retry\n");
1195 /* touch each r_stamp for handle_timeout()'s benefit */ 1156 /* touch each r_stamp for handle_timeout()'s benefit */
1196 list_for_each_entry(req, &osd->o_requests, r_osd_item) 1157 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
1158 struct ceph_osd_request *req =
1159 rb_entry(n, struct ceph_osd_request, r_node);
1197 req->r_stamp = jiffies; 1160 req->r_stamp = jiffies;
1161 }
1198 1162
1199 return -EAGAIN; 1163 return -EAGAIN;
1200 } 1164 }
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1206 return 0; 1170 return 0;
1207} 1171}
1208 1172
1209static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 1173static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
1174 bool wrlocked)
1210{ 1175{
1211 struct rb_node **p = &osdc->osds.rb_node; 1176 struct ceph_osd *osd;
1212 struct rb_node *parent = NULL;
1213 struct ceph_osd *osd = NULL;
1214 1177
1215 dout("__insert_osd %p osd%d\n", new, new->o_osd); 1178 if (wrlocked)
1216 while (*p) { 1179 verify_osdc_wrlocked(osdc);
1217 parent = *p; 1180 else
1218 osd = rb_entry(parent, struct ceph_osd, o_node); 1181 verify_osdc_locked(osdc);
1219 if (new->o_osd < osd->o_osd) 1182
1220 p = &(*p)->rb_left; 1183 if (o != CEPH_HOMELESS_OSD)
1221 else if (new->o_osd > osd->o_osd) 1184 osd = lookup_osd(&osdc->osds, o);
1222 p = &(*p)->rb_right; 1185 else
1223 else 1186 osd = &osdc->homeless_osd;
1224 BUG(); 1187 if (!osd) {
1188 if (!wrlocked)
1189 return ERR_PTR(-EAGAIN);
1190
1191 osd = create_osd(osdc, o);
1192 insert_osd(&osdc->osds, osd);
1193 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
1194 &osdc->osdmap->osd_addr[osd->o_osd]);
1225 } 1195 }
1226 1196
1227 rb_link_node(&new->o_node, parent, p); 1197 dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
1228 rb_insert_color(&new->o_node, &osdc->osds); 1198 return osd;
1229} 1199}
1230 1200
1231static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) 1201/*
1202 * Create request <-> OSD session relation.
1203 *
1204 * @req has to be assigned a tid, @osd may be homeless.
1205 */
1206static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1232{ 1207{
1233 struct ceph_osd *osd; 1208 verify_osd_locked(osd);
1234 struct rb_node *n = osdc->osds.rb_node; 1209 WARN_ON(!req->r_tid || req->r_osd);
1235 1210 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1236 while (n) { 1211 req, req->r_tid);
1237 osd = rb_entry(n, struct ceph_osd, o_node); 1212
1238 if (o < osd->o_osd) 1213 if (!osd_homeless(osd))
1239 n = n->rb_left; 1214 __remove_osd_from_lru(osd);
1240 else if (o > osd->o_osd) 1215 else
1241 n = n->rb_right; 1216 atomic_inc(&osd->o_osdc->num_homeless);
1242 else 1217
1243 return osd; 1218 get_osd(osd);
1244 } 1219 insert_request(&osd->o_requests, req);
1245 return NULL; 1220 req->r_osd = osd;
1246} 1221}
1247 1222
1248static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 1223static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1249{ 1224{
1250 schedule_delayed_work(&osdc->timeout_work, 1225 verify_osd_locked(osd);
1251 osdc->client->options->osd_keepalive_timeout); 1226 WARN_ON(req->r_osd != osd);
1227 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1228 req, req->r_tid);
1229
1230 req->r_osd = NULL;
1231 erase_request(&osd->o_requests, req);
1232 put_osd(osd);
1233
1234 if (!osd_homeless(osd))
1235 maybe_move_osd_to_lru(osd);
1236 else
1237 atomic_dec(&osd->o_osdc->num_homeless);
1252} 1238}
1253 1239
1254static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 1240static bool __pool_full(struct ceph_pg_pool_info *pi)
1255{ 1241{
1256 cancel_delayed_work(&osdc->timeout_work); 1242 return pi->flags & CEPH_POOL_FLAG_FULL;
1257} 1243}
1258 1244
1259/* 1245static bool have_pool_full(struct ceph_osd_client *osdc)
1260 * Register request, assign tid. If this is the first request, set up
1261 * the timeout event.
1262 */
1263static void __register_request(struct ceph_osd_client *osdc,
1264 struct ceph_osd_request *req)
1265{ 1246{
1266 req->r_tid = ++osdc->last_tid; 1247 struct rb_node *n;
1267 req->r_request->hdr.tid = cpu_to_le64(req->r_tid); 1248
1268 dout("__register_request %p tid %lld\n", req, req->r_tid); 1249 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1269 __insert_request(osdc, req); 1250 struct ceph_pg_pool_info *pi =
1270 ceph_osdc_get_request(req); 1251 rb_entry(n, struct ceph_pg_pool_info, node);
1271 osdc->num_requests++; 1252
1272 if (osdc->num_requests == 1) { 1253 if (__pool_full(pi))
1273 dout(" first request, scheduling timeout\n"); 1254 return true;
1274 __schedule_osd_timeout(osdc);
1275 } 1255 }
1256
1257 return false;
1258}
1259
1260static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
1261{
1262 struct ceph_pg_pool_info *pi;
1263
1264 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
1265 if (!pi)
1266 return false;
1267
1268 return __pool_full(pi);
1276} 1269}
1277 1270
1278/* 1271/*
1279 * called under osdc->request_mutex 1272 * Returns whether a request should be blocked from being sent
1273 * based on the current osdmap and osd_client settings.
1280 */ 1274 */
1281static void __unregister_request(struct ceph_osd_client *osdc, 1275static bool target_should_be_paused(struct ceph_osd_client *osdc,
1282 struct ceph_osd_request *req) 1276 const struct ceph_osd_request_target *t,
1277 struct ceph_pg_pool_info *pi)
1283{ 1278{
1284 if (RB_EMPTY_NODE(&req->r_node)) { 1279 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
1285 dout("__unregister_request %p tid %lld not registered\n", 1280 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
1286 req, req->r_tid); 1281 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1287 return; 1282 __pool_full(pi);
1283
1284 WARN_ON(pi->id != t->base_oloc.pool);
1285 return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
1286 (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
1287}
1288
1289enum calc_target_result {
1290 CALC_TARGET_NO_ACTION = 0,
1291 CALC_TARGET_NEED_RESEND,
1292 CALC_TARGET_POOL_DNE,
1293};
1294
1295static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1296 struct ceph_osd_request_target *t,
1297 u32 *last_force_resend,
1298 bool any_change)
1299{
1300 struct ceph_pg_pool_info *pi;
1301 struct ceph_pg pgid, last_pgid;
1302 struct ceph_osds up, acting;
1303 bool force_resend = false;
1304 bool need_check_tiering = false;
1305 bool need_resend = false;
1306 bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
1307 CEPH_OSDMAP_SORTBITWISE);
1308 enum calc_target_result ct_res;
1309 int ret;
1310
1311 pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
1312 if (!pi) {
1313 t->osd = CEPH_HOMELESS_OSD;
1314 ct_res = CALC_TARGET_POOL_DNE;
1315 goto out;
1288 } 1316 }
1289 1317
1290 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 1318 if (osdc->osdmap->epoch == pi->last_force_request_resend) {
1291 rb_erase(&req->r_node, &osdc->requests); 1319 if (last_force_resend &&
1292 RB_CLEAR_NODE(&req->r_node); 1320 *last_force_resend < pi->last_force_request_resend) {
1293 osdc->num_requests--; 1321 *last_force_resend = pi->last_force_request_resend;
1322 force_resend = true;
1323 } else if (!last_force_resend) {
1324 force_resend = true;
1325 }
1326 }
1327 if (ceph_oid_empty(&t->target_oid) || force_resend) {
1328 ceph_oid_copy(&t->target_oid, &t->base_oid);
1329 need_check_tiering = true;
1330 }
1331 if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
1332 ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
1333 need_check_tiering = true;
1334 }
1294 1335
1295 if (req->r_osd) { 1336 if (need_check_tiering &&
1296 /* make sure the original request isn't in flight. */ 1337 (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
1297 ceph_msg_revoke(req->r_request); 1338 if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
1339 t->target_oloc.pool = pi->read_tier;
1340 if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
1341 t->target_oloc.pool = pi->write_tier;
1342 }
1298 1343
1299 list_del_init(&req->r_osd_item); 1344 ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
1300 maybe_move_osd_to_lru(osdc, req->r_osd); 1345 &t->target_oloc, &pgid);
1301 if (list_empty(&req->r_linger_osd_item)) 1346 if (ret) {
1302 req->r_osd = NULL; 1347 WARN_ON(ret != -ENOENT);
1348 t->osd = CEPH_HOMELESS_OSD;
1349 ct_res = CALC_TARGET_POOL_DNE;
1350 goto out;
1351 }
1352 last_pgid.pool = pgid.pool;
1353 last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
1354
1355 ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
1356 if (any_change &&
1357 ceph_is_new_interval(&t->acting,
1358 &acting,
1359 &t->up,
1360 &up,
1361 t->size,
1362 pi->size,
1363 t->min_size,
1364 pi->min_size,
1365 t->pg_num,
1366 pi->pg_num,
1367 t->sort_bitwise,
1368 sort_bitwise,
1369 &last_pgid))
1370 force_resend = true;
1371
1372 if (t->paused && !target_should_be_paused(osdc, t, pi)) {
1373 t->paused = false;
1374 need_resend = true;
1303 } 1375 }
1304 1376
1305 list_del_init(&req->r_req_lru_item); 1377 if (ceph_pg_compare(&t->pgid, &pgid) ||
1306 ceph_osdc_put_request(req); 1378 ceph_osds_changed(&t->acting, &acting, any_change) ||
1379 force_resend) {
1380 t->pgid = pgid; /* struct */
1381 ceph_osds_copy(&t->acting, &acting);
1382 ceph_osds_copy(&t->up, &up);
1383 t->size = pi->size;
1384 t->min_size = pi->min_size;
1385 t->pg_num = pi->pg_num;
1386 t->pg_num_mask = pi->pg_num_mask;
1387 t->sort_bitwise = sort_bitwise;
1388
1389 t->osd = acting.primary;
1390 need_resend = true;
1391 }
1392
1393 ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
1394out:
1395 dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
1396 return ct_res;
1397}
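
Callers branch on the tri-state result; a hedged sketch of the dispatch (the real submission path is __submit_request() below, and send_map_check() needs osdc->lock held for write):

	switch (calc_target(osdc, &req->r_t, NULL, false)) {
	case CALC_TARGET_NO_ACTION:
		break;			/* mapping unchanged */
	case CALC_TARGET_NEED_RESEND:
		/* unlink from the old session, relink, resend */
		break;
	case CALC_TARGET_POOL_DNE:
		send_map_check(req);	/* pool may have been deleted */
		break;
	}
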
1398
1399static void setup_request_data(struct ceph_osd_request *req,
1400 struct ceph_msg *msg)
1401{
1402 u32 data_len = 0;
1403 int i;
1404
1405 if (!list_empty(&msg->data))
1406 return;
1407
1408 WARN_ON(msg->data_length);
1409 for (i = 0; i < req->r_num_ops; i++) {
1410 struct ceph_osd_req_op *op = &req->r_ops[i];
1411
1412 switch (op->op) {
1413 /* request */
1414 case CEPH_OSD_OP_WRITE:
1415 case CEPH_OSD_OP_WRITEFULL:
1416 WARN_ON(op->indata_len != op->extent.length);
1417 ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
1418 break;
1419 case CEPH_OSD_OP_SETXATTR:
1420 case CEPH_OSD_OP_CMPXATTR:
1421 WARN_ON(op->indata_len != op->xattr.name_len +
1422 op->xattr.value_len);
1423 ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
1424 break;
1425 case CEPH_OSD_OP_NOTIFY_ACK:
1426 ceph_osdc_msg_data_add(msg,
1427 &op->notify_ack.request_data);
1428 break;
1429
1430 /* reply */
1431 case CEPH_OSD_OP_STAT:
1432 ceph_osdc_msg_data_add(req->r_reply,
1433 &op->raw_data_in);
1434 break;
1435 case CEPH_OSD_OP_READ:
1436 ceph_osdc_msg_data_add(req->r_reply,
1437 &op->extent.osd_data);
1438 break;
1439
1440 /* both */
1441 case CEPH_OSD_OP_CALL:
1442 WARN_ON(op->indata_len != op->cls.class_len +
1443 op->cls.method_len +
1444 op->cls.indata_len);
1445 ceph_osdc_msg_data_add(msg, &op->cls.request_info);
1446 /* optional, can be NONE */
1447 ceph_osdc_msg_data_add(msg, &op->cls.request_data);
1448 /* optional, can be NONE */
1449 ceph_osdc_msg_data_add(req->r_reply,
1450 &op->cls.response_data);
1451 break;
1452 case CEPH_OSD_OP_NOTIFY:
1453 ceph_osdc_msg_data_add(msg,
1454 &op->notify.request_data);
1455 ceph_osdc_msg_data_add(req->r_reply,
1456 &op->notify.response_data);
1457 break;
1458 }
1459
1460 data_len += op->indata_len;
1461 }
1462
1463 WARN_ON(data_len != msg->data_length);
1464}
1465
1466static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
1467{
1468 void *p = msg->front.iov_base;
1469 void *const end = p + msg->front_alloc_len;
1470 u32 data_len = 0;
1471 int i;
1472
1473 if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
1474 /* snapshots aren't writeable */
1475 WARN_ON(req->r_snapid != CEPH_NOSNAP);
1476 } else {
1477 WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
1478 req->r_data_offset || req->r_snapc);
1479 }
1480
1481 setup_request_data(req, msg);
1482
1483 ceph_encode_32(&p, 1); /* client_inc, always 1 */
1484 ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
1485 ceph_encode_32(&p, req->r_flags);
1486 ceph_encode_timespec(p, &req->r_mtime);
1487 p += sizeof(struct ceph_timespec);
1488 /* aka reassert_version */
1489 memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
1490 p += sizeof(req->r_replay_version);
1491
1492 /* oloc */
1493 ceph_encode_8(&p, 4);
1494 ceph_encode_8(&p, 4);
1495 ceph_encode_32(&p, 8 + 4 + 4);
1496 ceph_encode_64(&p, req->r_t.target_oloc.pool);
1497 ceph_encode_32(&p, -1); /* preferred */
1498 ceph_encode_32(&p, 0); /* key len */
1499
1500 /* pgid */
1501 ceph_encode_8(&p, 1);
1502 ceph_encode_64(&p, req->r_t.pgid.pool);
1503 ceph_encode_32(&p, req->r_t.pgid.seed);
1504 ceph_encode_32(&p, -1); /* preferred */
1505
1506 /* oid */
1507 ceph_encode_32(&p, req->r_t.target_oid.name_len);
1508 memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
1509 p += req->r_t.target_oid.name_len;
1307 1510
1308 if (osdc->num_requests == 0) { 1511 /* ops, can imply data */
1309 dout(" no requests, canceling timeout\n"); 1512 ceph_encode_16(&p, req->r_num_ops);
1310 __cancel_osd_timeout(osdc); 1513 for (i = 0; i < req->r_num_ops; i++) {
1514 data_len += osd_req_encode_op(p, &req->r_ops[i]);
1515 p += sizeof(struct ceph_osd_op);
1311 } 1516 }
1517
1518 ceph_encode_64(&p, req->r_snapid); /* snapid */
1519 if (req->r_snapc) {
1520 ceph_encode_64(&p, req->r_snapc->seq);
1521 ceph_encode_32(&p, req->r_snapc->num_snaps);
1522 for (i = 0; i < req->r_snapc->num_snaps; i++)
1523 ceph_encode_64(&p, req->r_snapc->snaps[i]);
1524 } else {
1525 ceph_encode_64(&p, 0); /* snap_seq */
1526 ceph_encode_32(&p, 0); /* snaps len */
1527 }
1528
1529 ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
1530
1531 BUG_ON(p > end);
1532 msg->front.iov_len = p - msg->front.iov_base;
1533 msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
1534 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1535 msg->hdr.data_len = cpu_to_le32(data_len);
1536 /*
1537 * The header "data_off" is a hint to the receiver allowing it
1538 * to align received data into its buffers such that there's no
1539 * need to re-copy it before writing it to disk (direct I/O).
1540 */
1541 msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
1542
1543 dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
1544 req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
1545 req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
1312} 1546}
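
encode_request() is built on the ceph_encode_*() cursor helpers from include/linux/ceph/decode.h: each writes a fixed-width little-endian value and advances the position pointer. Roughly (buf, epoch and pool_id are placeholders):

	void *p = buf;

	ceph_encode_8(&p, 4);		/* u8,   p += 1 */
	ceph_encode_32(&p, epoch);	/* le32, p += 4 */
	ceph_encode_64(&p, pool_id);	/* le64, p += 8 */
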
1313 1547
1314/* 1548/*
1315 * Cancel a previously queued request message 1549 * @req has to be assigned a tid and registered.
1316 */ 1550 */
1317static void __cancel_request(struct ceph_osd_request *req) 1551static void send_request(struct ceph_osd_request *req)
1318{ 1552{
1319 if (req->r_sent && req->r_osd) { 1553 struct ceph_osd *osd = req->r_osd;
1554
1555 verify_osd_locked(osd);
1556 WARN_ON(osd->o_osd != req->r_t.osd);
1557
1558 /*
1559 * We may have a previously queued request message hanging
1560 * around. Cancel it to avoid corrupting the msgr.
1561 */
1562 if (req->r_sent)
1320 ceph_msg_revoke(req->r_request); 1563 ceph_msg_revoke(req->r_request);
1321 req->r_sent = 0; 1564
1565 req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
1566 if (req->r_attempts)
1567 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1568 else
1569 WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
1570
1571 encode_request(req, req->r_request);
1572
1573 dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
1574 __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
1575 req->r_t.osd, req->r_flags, req->r_attempts);
1576
1577 req->r_t.paused = false;
1578 req->r_stamp = jiffies;
1579 req->r_attempts++;
1580
1581 req->r_sent = osd->o_incarnation;
1582 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
1583 ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
1584}
1585
1586static void maybe_request_map(struct ceph_osd_client *osdc)
1587{
1588 bool continuous = false;
1589
1590 verify_osdc_locked(osdc);
1591 WARN_ON(!osdc->osdmap->epoch);
1592
1593 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1594 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
1595 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1596 dout("%s osdc %p continuous\n", __func__, osdc);
1597 continuous = true;
1598 } else {
1599 dout("%s osdc %p onetime\n", __func__, osdc);
1322 } 1600 }
1601
1602 if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
1603 osdc->osdmap->epoch + 1, continuous))
1604 ceph_monc_renew_subs(&osdc->client->monc);
1323} 1605}
1324 1606
1325static void __register_linger_request(struct ceph_osd_client *osdc, 1607static void send_map_check(struct ceph_osd_request *req);
1326 struct ceph_osd_request *req) 1608
1609static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
1327{ 1610{
1328 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1611 struct ceph_osd_client *osdc = req->r_osdc;
1329 WARN_ON(!req->r_linger); 1612 struct ceph_osd *osd;
1613 enum calc_target_result ct_res;
1614 bool need_send = false;
1615 bool promoted = false;
1616
1617 WARN_ON(req->r_tid || req->r_got_reply);
1618 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
1619
1620again:
1621 ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
1622 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
1623 goto promote;
1624
1625 osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
1626 if (IS_ERR(osd)) {
1627 WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
1628 goto promote;
1629 }
1630
1631 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1632 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1633 dout("req %p pausewr\n", req);
1634 req->r_t.paused = true;
1635 maybe_request_map(osdc);
1636 } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
1637 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1638 dout("req %p pauserd\n", req);
1639 req->r_t.paused = true;
1640 maybe_request_map(osdc);
1641 } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1642 !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
1643 CEPH_OSD_FLAG_FULL_FORCE)) &&
1644 (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1645 pool_full(osdc, req->r_t.base_oloc.pool))) {
1646 dout("req %p full/pool_full\n", req);
1647 pr_warn_ratelimited("FULL or reached pool quota\n");
1648 req->r_t.paused = true;
1649 maybe_request_map(osdc);
1650 } else if (!osd_homeless(osd)) {
1651 need_send = true;
1652 } else {
1653 maybe_request_map(osdc);
1654 }
1655
1656 mutex_lock(&osd->lock);
1657 /*
1658 * Assign the tid atomically with send_request() to protect
1659 * multiple writes to the same object from racing with each
1660 * other, resulting in out of order ops on the OSDs.
1661 */
1662 req->r_tid = atomic64_inc_return(&osdc->last_tid);
1663 link_request(osd, req);
1664 if (need_send)
1665 send_request(req);
1666 mutex_unlock(&osd->lock);
1330 1667
1668 if (ct_res == CALC_TARGET_POOL_DNE)
1669 send_map_check(req);
1670
1671 if (promoted)
1672 downgrade_write(&osdc->lock);
1673 return;
1674
1675promote:
1676 up_read(&osdc->lock);
1677 down_write(&osdc->lock);
1678 wrlocked = true;
1679 promoted = true;
1680 goto again;
1681}
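
The promote: label implements the usual rwsem idiom for a reader that discovers it needs write access: an rw_semaphore cannot be upgraded in place, so the path drops the read lock, takes the write lock, and retries from the top since the world may have changed in between. Schematically (do_work() is hypothetical):

	down_read(&osdc->lock);
	/* ... notice that write access is required ... */
	up_read(&osdc->lock);
	down_write(&osdc->lock);
	/* re-check: state may have changed while unlocked */
	do_work(osdc);
	downgrade_write(&osdc->lock);	/* atomically back to reader */
	up_read(&osdc->lock);
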
1682
1683static void account_request(struct ceph_osd_request *req)
1684{
1685 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
1686
1687 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1688 WARN_ON(req->r_flags & mask);
1689 req->r_flags |= CEPH_OSD_FLAG_ACK;
1690 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1691 WARN_ON(!(req->r_flags & mask));
1692 else
1693 WARN_ON(1);
1694
1695 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
1696 atomic_inc(&req->r_osdc->num_requests);
1697}
1698
1699static void submit_request(struct ceph_osd_request *req, bool wrlocked)
1700{
1331 ceph_osdc_get_request(req); 1701 ceph_osdc_get_request(req);
1332 list_add_tail(&req->r_linger_item, &osdc->req_linger); 1702 account_request(req);
1333 if (req->r_osd) 1703 __submit_request(req, wrlocked);
1334 list_add_tail(&req->r_linger_osd_item,
1335 &req->r_osd->o_linger_requests);
1336} 1704}
1337 1705
1338static void __unregister_linger_request(struct ceph_osd_client *osdc, 1706static void __finish_request(struct ceph_osd_request *req)
1339 struct ceph_osd_request *req)
1340{ 1707{
1341 WARN_ON(!req->r_linger); 1708 struct ceph_osd_client *osdc = req->r_osdc;
1709 struct ceph_osd *osd = req->r_osd;
1342 1710
1343 if (list_empty(&req->r_linger_item)) { 1711 verify_osd_locked(osd);
1344 dout("%s %p tid %llu not registered\n", __func__, req, 1712 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1345 req->r_tid); 1713
1714 WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
1715 unlink_request(osd, req);
1716 atomic_dec(&osdc->num_requests);
1717
1718 /*
1719 * If an OSD has failed or returned and a request has been sent
1720 * twice, it's possible to get a reply and end up here while the
1721 * request message is queued for delivery. We will ignore the
1722 * reply, so not a big deal, but better to try and catch it.
1723 */
1724 ceph_msg_revoke(req->r_request);
1725 ceph_msg_revoke_incoming(req->r_reply);
1726}
1727
1728static void finish_request(struct ceph_osd_request *req)
1729{
1730 __finish_request(req);
1731 ceph_osdc_put_request(req);
1732}
1733
1734static void __complete_request(struct ceph_osd_request *req)
1735{
1736 if (req->r_callback)
1737 req->r_callback(req);
1738 else
1739 complete_all(&req->r_completion);
1740}
1741
1742/*
1743 * Note that this is open-coded in handle_reply(), which has to deal
1744 * with ack vs commit, dup acks, etc.
1745 */
1746static void complete_request(struct ceph_osd_request *req, int err)
1747{
1748 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1749
1750 req->r_result = err;
1751 __finish_request(req);
1752 __complete_request(req);
1753 complete_all(&req->r_safe_completion);
1754 ceph_osdc_put_request(req);
1755}
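
A synchronous caller pairs submission with a wait on r_completion; in rough form this is what ceph_osdc_wait_request() amounts to (sketch only, the real wait also cancels the request when interrupted):

	submit_request(req, false);
	if (wait_for_completion_interruptible(&req->r_completion))
		return -ERESTARTSYS;
	return req->r_result;		/* filled in from the reply */
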
1756
1757static void cancel_map_check(struct ceph_osd_request *req)
1758{
1759 struct ceph_osd_client *osdc = req->r_osdc;
1760 struct ceph_osd_request *lookup_req;
1761
1762 verify_osdc_wrlocked(osdc);
1763
1764 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1765 if (!lookup_req)
1346 return; 1766 return;
1767
1768 WARN_ON(lookup_req != req);
1769 erase_request_mc(&osdc->map_checks, req);
1770 ceph_osdc_put_request(req);
1771}
1772
1773static void cancel_request(struct ceph_osd_request *req)
1774{
1775 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1776
1777 cancel_map_check(req);
1778 finish_request(req);
1779}
1780
1781static void check_pool_dne(struct ceph_osd_request *req)
1782{
1783 struct ceph_osd_client *osdc = req->r_osdc;
1784 struct ceph_osdmap *map = osdc->osdmap;
1785
1786 verify_osdc_wrlocked(osdc);
1787 WARN_ON(!map->epoch);
1788
1789 if (req->r_attempts) {
1790 /*
1791 * We sent a request earlier, which means that
1792 * previously the pool existed, and now it does not
1793 * (i.e., it was deleted).
1794 */
1795 req->r_map_dne_bound = map->epoch;
1796 dout("%s req %p tid %llu pool disappeared\n", __func__, req,
1797 req->r_tid);
1798 } else {
1799 dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
1800 req, req->r_tid, req->r_map_dne_bound, map->epoch);
1347 } 1801 }
1348 1802
1349 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1803 if (req->r_map_dne_bound) {
1350 list_del_init(&req->r_linger_item); 1804 if (map->epoch >= req->r_map_dne_bound) {
1805 /* we had a new enough map */
1806 pr_info_ratelimited("tid %llu pool does not exist\n",
1807 req->r_tid);
1808 complete_request(req, -ENOENT);
1809 }
1810 } else {
1811 send_map_check(req);
1812 }
1813}
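
Worked example with made-up epochs: a request was sent at epoch 5, the pool is deleted, and at epoch 7 calc_target() reports CALC_TARGET_POOL_DNE. Because r_attempts is non-zero, map_dne_bound is pinned to the current epoch (7), the epoch >= bound test passes immediately and the request completes with -ENOENT. If the request was never sent, the client cannot tell a deleted pool from one its lagging map has not seen yet, so the bound is fetched from the monitor via send_map_check()/map_check_cb().
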
1351 1814
1352 if (req->r_osd) { 1815static void map_check_cb(struct ceph_mon_generic_request *greq)
1353 list_del_init(&req->r_linger_osd_item); 1816{
1354 maybe_move_osd_to_lru(osdc, req->r_osd); 1817 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
1355 if (list_empty(&req->r_osd_item)) 1818 struct ceph_osd_request *req;
1356 req->r_osd = NULL; 1819 u64 tid = greq->private_data;
1820
1821 WARN_ON(greq->result || !greq->u.newest);
1822
1823 down_write(&osdc->lock);
1824 req = lookup_request_mc(&osdc->map_checks, tid);
1825 if (!req) {
1826 dout("%s tid %llu dne\n", __func__, tid);
1827 goto out_unlock;
1357 } 1828 }
1829
1830 dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
1831 req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
1832 if (!req->r_map_dne_bound)
1833 req->r_map_dne_bound = greq->u.newest;
1834 erase_request_mc(&osdc->map_checks, req);
1835 check_pool_dne(req);
1836
1358 ceph_osdc_put_request(req); 1837 ceph_osdc_put_request(req);
1838out_unlock:
1839 up_write(&osdc->lock);
1359} 1840}
1360 1841
1361void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 1842static void send_map_check(struct ceph_osd_request *req)
1362 struct ceph_osd_request *req)
1363{ 1843{
1364 if (!req->r_linger) { 1844 struct ceph_osd_client *osdc = req->r_osdc;
1365 dout("set_request_linger %p\n", req); 1845 struct ceph_osd_request *lookup_req;
1366 req->r_linger = 1; 1846 int ret;
1847
1848 verify_osdc_wrlocked(osdc);
1849
1850 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1851 if (lookup_req) {
1852 WARN_ON(lookup_req != req);
1853 return;
1367 } 1854 }
1855
1856 ceph_osdc_get_request(req);
1857 insert_request_mc(&osdc->map_checks, req);
1858 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
1859 map_check_cb, req->r_tid);
1860 WARN_ON(ret);
1368} 1861}
1369EXPORT_SYMBOL(ceph_osdc_set_request_linger);
1370 1862
1371/* 1863/*
1372 * Returns whether a request should be blocked from being sent 1864 * lingering requests, watch/notify v2 infrastructure
1373 * based on the current osdmap and osd_client settings.
1374 *
1375 * Caller should hold map_sem for read.
1376 */ 1865 */
1377static bool __req_should_be_paused(struct ceph_osd_client *osdc, 1866static void linger_release(struct kref *kref)
1378 struct ceph_osd_request *req)
1379{ 1867{
1380 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); 1868 struct ceph_osd_linger_request *lreq =
1381 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || 1869 container_of(kref, struct ceph_osd_linger_request, kref);
1382 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1870
1383 return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || 1871 dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
1384 (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); 1872 lreq->reg_req, lreq->ping_req);
1873 WARN_ON(!RB_EMPTY_NODE(&lreq->node));
1874 WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
1875 WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
1876 WARN_ON(!list_empty(&lreq->scan_item));
1877 WARN_ON(!list_empty(&lreq->pending_lworks));
1878 WARN_ON(lreq->osd);
1879
1880 if (lreq->reg_req)
1881 ceph_osdc_put_request(lreq->reg_req);
1882 if (lreq->ping_req)
1883 ceph_osdc_put_request(lreq->ping_req);
1884 target_destroy(&lreq->t);
1885 kfree(lreq);
1385} 1886}
1386 1887
1888static void linger_put(struct ceph_osd_linger_request *lreq)
1889{
1890 if (lreq)
1891 kref_put(&lreq->kref, linger_release);
1892}
1893
1894static struct ceph_osd_linger_request *
1895linger_get(struct ceph_osd_linger_request *lreq)
1896{
1897 kref_get(&lreq->kref);
1898 return lreq;
1899}
1900
1901static struct ceph_osd_linger_request *
1902linger_alloc(struct ceph_osd_client *osdc)
1903{
1904 struct ceph_osd_linger_request *lreq;
1905
1906 lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
1907 if (!lreq)
1908 return NULL;
1909
1910 kref_init(&lreq->kref);
1911 mutex_init(&lreq->lock);
1912 RB_CLEAR_NODE(&lreq->node);
1913 RB_CLEAR_NODE(&lreq->osdc_node);
1914 RB_CLEAR_NODE(&lreq->mc_node);
1915 INIT_LIST_HEAD(&lreq->scan_item);
1916 INIT_LIST_HEAD(&lreq->pending_lworks);
1917 init_completion(&lreq->reg_commit_wait);
1918 init_completion(&lreq->notify_finish_wait);
1919
1920 lreq->osdc = osdc;
1921 target_init(&lreq->t);
1922
1923 dout("%s lreq %p\n", __func__, lreq);
1924 return lreq;
1925}
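
Lifetime follows the standard kref pattern: linger_alloc() returns with one reference, every additional user takes its own with linger_get(), and the final linger_put() lands in linger_release(). A sketch:

	struct ceph_osd_linger_request *lreq = linger_alloc(osdc);

	if (!lreq)
		return -ENOMEM;
	req->r_priv = linger_get(lreq);	/* reference for the callback */
	/* ... */
	linger_put(lreq);		/* callback side done */
	linger_put(lreq);		/* initial ref; may free lreq */
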
1926
1927DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
1928DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
1929DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
1930
1387/* 1931/*
1388 * Calculate mapping of a request to a PG. Takes tiering into account. 1932 * Create linger request <-> OSD session relation.
1933 *
1934 * @lreq has to be registered, @osd may be homeless.
1389 */ 1935 */
1390static int __calc_request_pg(struct ceph_osdmap *osdmap, 1936static void link_linger(struct ceph_osd *osd,
1391 struct ceph_osd_request *req, 1937 struct ceph_osd_linger_request *lreq)
1392 struct ceph_pg *pg_out)
1393{ 1938{
1394 bool need_check_tiering; 1939 verify_osd_locked(osd);
1940 WARN_ON(!lreq->linger_id || lreq->osd);
1941 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1942 osd->o_osd, lreq, lreq->linger_id);
1395 1943
1396 need_check_tiering = false; 1944 if (!osd_homeless(osd))
1397 if (req->r_target_oloc.pool == -1) { 1945 __remove_osd_from_lru(osd);
1398 req->r_target_oloc = req->r_base_oloc; /* struct */ 1946 else
1399 need_check_tiering = true; 1947 atomic_inc(&osd->o_osdc->num_homeless);
1948
1949 get_osd(osd);
1950 insert_linger(&osd->o_linger_requests, lreq);
1951 lreq->osd = osd;
1952}
1953
1954static void unlink_linger(struct ceph_osd *osd,
1955 struct ceph_osd_linger_request *lreq)
1956{
1957 verify_osd_locked(osd);
1958 WARN_ON(lreq->osd != osd);
1959 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1960 osd->o_osd, lreq, lreq->linger_id);
1961
1962 lreq->osd = NULL;
1963 erase_linger(&osd->o_linger_requests, lreq);
1964 put_osd(osd);
1965
1966 if (!osd_homeless(osd))
1967 maybe_move_osd_to_lru(osd);
1968 else
1969 atomic_dec(&osd->o_osdc->num_homeless);
1970}
1971
1972static bool __linger_registered(struct ceph_osd_linger_request *lreq)
1973{
1974 verify_osdc_locked(lreq->osdc);
1975
1976 return !RB_EMPTY_NODE(&lreq->osdc_node);
1977}
1978
1979static bool linger_registered(struct ceph_osd_linger_request *lreq)
1980{
1981 struct ceph_osd_client *osdc = lreq->osdc;
1982 bool registered;
1983
1984 down_read(&osdc->lock);
1985 registered = __linger_registered(lreq);
1986 up_read(&osdc->lock);
1987
1988 return registered;
1989}
1990
1991static void linger_register(struct ceph_osd_linger_request *lreq)
1992{
1993 struct ceph_osd_client *osdc = lreq->osdc;
1994
1995 verify_osdc_wrlocked(osdc);
1996 WARN_ON(lreq->linger_id);
1997
1998 linger_get(lreq);
1999 lreq->linger_id = ++osdc->last_linger_id;
2000 insert_linger_osdc(&osdc->linger_requests, lreq);
2001}
2002
2003static void linger_unregister(struct ceph_osd_linger_request *lreq)
2004{
2005 struct ceph_osd_client *osdc = lreq->osdc;
2006
2007 verify_osdc_wrlocked(osdc);
2008
2009 erase_linger_osdc(&osdc->linger_requests, lreq);
2010 linger_put(lreq);
2011}
2012
2013static void cancel_linger_request(struct ceph_osd_request *req)
2014{
2015 struct ceph_osd_linger_request *lreq = req->r_priv;
2016
2017 WARN_ON(!req->r_linger);
2018 cancel_request(req);
2019 linger_put(lreq);
2020}
2021
2022struct linger_work {
2023 struct work_struct work;
2024 struct ceph_osd_linger_request *lreq;
2025 struct list_head pending_item;
2026 unsigned long queued_stamp;
2027
2028 union {
2029 struct {
2030 u64 notify_id;
2031 u64 notifier_id;
2032 void *payload; /* points into @msg front */
2033 size_t payload_len;
2034
2035 struct ceph_msg *msg; /* for ceph_msg_put() */
2036 } notify;
2037 struct {
2038 int err;
2039 } error;
2040 };
2041};
2042
2043static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
2044 work_func_t workfn)
2045{
2046 struct linger_work *lwork;
2047
2048 lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
2049 if (!lwork)
2050 return NULL;
2051
2052 INIT_WORK(&lwork->work, workfn);
2053 INIT_LIST_HEAD(&lwork->pending_item);
2054 lwork->lreq = linger_get(lreq);
2055
2056 return lwork;
2057}
2058
2059static void lwork_free(struct linger_work *lwork)
2060{
2061 struct ceph_osd_linger_request *lreq = lwork->lreq;
2062
2063 mutex_lock(&lreq->lock);
2064 list_del(&lwork->pending_item);
2065 mutex_unlock(&lreq->lock);
2066
2067 linger_put(lreq);
2068 kfree(lwork);
2069}
2070
2071static void lwork_queue(struct linger_work *lwork)
2072{
2073 struct ceph_osd_linger_request *lreq = lwork->lreq;
2074 struct ceph_osd_client *osdc = lreq->osdc;
2075
2076 verify_lreq_locked(lreq);
2077 WARN_ON(!list_empty(&lwork->pending_item));
2078
2079 lwork->queued_stamp = jiffies;
2080 list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
2081 queue_work(osdc->notify_wq, &lwork->work);
2082}
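
A producer packages one event and hands it to notify_wq; queue_watch_error() below is the in-tree user, and in outline (the payload value is illustrative):

	struct linger_work *lwork;

	lwork = lwork_alloc(lreq, do_watch_error);
	if (!lwork)
		return;			/* event dropped, nothing to undo */
	lwork->error.err = -ENOTCONN;	/* illustrative */
	lwork_queue(lwork);		/* caller holds lreq->lock */
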
2083
2084static void do_watch_notify(struct work_struct *w)
2085{
2086 struct linger_work *lwork = container_of(w, struct linger_work, work);
2087 struct ceph_osd_linger_request *lreq = lwork->lreq;
2088
2089 if (!linger_registered(lreq)) {
2090 dout("%s lreq %p not registered\n", __func__, lreq);
2091 goto out;
1400 } 2092 }
1401 if (req->r_target_oid.name_len == 0) { 2093
1402 ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); 2094 WARN_ON(!lreq->is_watch);
1403 need_check_tiering = true; 2095 dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
2096 __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
2097 lwork->notify.payload_len);
2098 lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
2099 lwork->notify.notifier_id, lwork->notify.payload,
2100 lwork->notify.payload_len);
2101
2102out:
2103 ceph_msg_put(lwork->notify.msg);
2104 lwork_free(lwork);
2105}
2106
2107static void do_watch_error(struct work_struct *w)
2108{
2109 struct linger_work *lwork = container_of(w, struct linger_work, work);
2110 struct ceph_osd_linger_request *lreq = lwork->lreq;
2111
2112 if (!linger_registered(lreq)) {
2113 dout("%s lreq %p not registered\n", __func__, lreq);
2114 goto out;
1404 } 2115 }
1405 2116
1406 if (need_check_tiering && 2117 dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
1407 (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 2118 lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
1408 struct ceph_pg_pool_info *pi; 2119
1409 2120out:
1410 pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); 2121 lwork_free(lwork);
1411 if (pi) { 2122}
1412 if ((req->r_flags & CEPH_OSD_FLAG_READ) && 2123
1413 pi->read_tier >= 0) 2124static void queue_watch_error(struct ceph_osd_linger_request *lreq)
1414 req->r_target_oloc.pool = pi->read_tier; 2125{
1415 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 2126 struct linger_work *lwork;
1416 pi->write_tier >= 0) 2127
1417 req->r_target_oloc.pool = pi->write_tier; 2128 lwork = lwork_alloc(lreq, do_watch_error);
2129 if (!lwork) {
2130 pr_err("failed to allocate error-lwork\n");
2131 return;
2132 }
2133
2134 lwork->error.err = lreq->last_error;
2135 lwork_queue(lwork);
2136}
2137
2138static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
2139 int result)
2140{
2141 if (!completion_done(&lreq->reg_commit_wait)) {
2142 lreq->reg_commit_error = (result <= 0 ? result : 0);
2143 complete_all(&lreq->reg_commit_wait);
2144 }
2145}
2146
2147static void linger_commit_cb(struct ceph_osd_request *req)
2148{
2149 struct ceph_osd_linger_request *lreq = req->r_priv;
2150
2151 mutex_lock(&lreq->lock);
2152 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
2153 lreq->linger_id, req->r_result);
2154 WARN_ON(!__linger_registered(lreq));
2155 linger_reg_commit_complete(lreq, req->r_result);
2156 lreq->committed = true;
2157
2158 if (!lreq->is_watch) {
2159 struct ceph_osd_data *osd_data =
2160 osd_req_op_data(req, 0, notify, response_data);
2161 void *p = page_address(osd_data->pages[0]);
2162
2163 WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
2164 osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
2165
2166 /* make note of the notify_id */
2167 if (req->r_ops[0].outdata_len >= sizeof(u64)) {
2168 lreq->notify_id = ceph_decode_64(&p);
2169 dout("lreq %p notify_id %llu\n", lreq,
2170 lreq->notify_id);
2171 } else {
2172 dout("lreq %p no notify_id\n", lreq);
1418 } 2173 }
1419 /* !pi is caught in ceph_oloc_oid_to_pg() */
1420 } 2174 }
1421 2175
1422 return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, 2176 mutex_unlock(&lreq->lock);
1423 &req->r_target_oid, pg_out); 2177 linger_put(lreq);
1424} 2178}
1425 2179
1426static void __enqueue_request(struct ceph_osd_request *req) 2180static int normalize_watch_error(int err)
1427{ 2181{
1428 struct ceph_osd_client *osdc = req->r_osdc; 2182 /*
2183 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
2184 * notification and a failure to reconnect because we raced with
2185 * the delete appear the same to the user.
2186 */
2187 if (err == -ENOENT)
2188 err = -ENOTCONN;
2189
2190 return err;
2191}
2192
2193static void linger_reconnect_cb(struct ceph_osd_request *req)
2194{
2195 struct ceph_osd_linger_request *lreq = req->r_priv;
2196
2197 mutex_lock(&lreq->lock);
2198 dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
2199 lreq, lreq->linger_id, req->r_result, lreq->last_error);
2200 if (req->r_result < 0) {
2201 if (!lreq->last_error) {
2202 lreq->last_error = normalize_watch_error(req->r_result);
2203 queue_watch_error(lreq);
2204 }
2205 }
1429 2206
1430 dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, 2207 mutex_unlock(&lreq->lock);
1431 req->r_osd ? req->r_osd->o_osd : -1); 2208 linger_put(lreq);
2209}
2210
2211static void send_linger(struct ceph_osd_linger_request *lreq)
2212{
2213 struct ceph_osd_request *req = lreq->reg_req;
2214 struct ceph_osd_req_op *op = &req->r_ops[0];
1432 2215
1433 if (req->r_osd) { 2216 verify_osdc_wrlocked(req->r_osdc);
1434 __remove_osd_from_lru(req->r_osd); 2217 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
1435 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); 2218
1436 list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); 2219 if (req->r_osd)
2220 cancel_linger_request(req);
2221
2222 request_reinit(req);
2223 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
2224 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
2225 req->r_flags = lreq->t.flags;
2226 req->r_mtime = lreq->mtime;
2227
2228 mutex_lock(&lreq->lock);
2229 if (lreq->is_watch && lreq->committed) {
2230 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2231 op->watch.cookie != lreq->linger_id);
2232 op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
2233 op->watch.gen = ++lreq->register_gen;
2234 dout("lreq %p reconnect register_gen %u\n", lreq,
2235 op->watch.gen);
2236 req->r_callback = linger_reconnect_cb;
1437 } else { 2237 } else {
1438 list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); 2238 if (!lreq->is_watch)
2239 lreq->notify_id = 0;
2240 else
2241 WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
2242 dout("lreq %p register\n", lreq);
2243 req->r_callback = linger_commit_cb;
1439 } 2244 }
2245 mutex_unlock(&lreq->lock);
2246
2247 req->r_priv = linger_get(lreq);
2248 req->r_linger = true;
2249
2250 submit_request(req, true);
1440} 2251}
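
Together with send_linger_ping() below, this gives the watch op three variants over a linger request's life, roughly: CEPH_OSD_WATCH_OP_WATCH for the initial registration, CEPH_OSD_WATCH_OP_RECONNECT (with watch.gen incremented) when the request is re-sent after a map change, and CEPH_OSD_WATCH_OP_PING for periodic liveness checks, whose replies are discarded if they carry a stale gen (see linger_ping_cb()).
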
1441 2252
1442/* 2253static void linger_ping_cb(struct ceph_osd_request *req)
1443 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
1444 * (as needed), and set the request r_osd appropriately. If there is
1445 * no up osd, set r_osd to NULL. Move the request to the appropriate list
1446 * (unsent, homeless) or leave on in-flight lru.
1447 *
1448 * Return 0 if unchanged, 1 if changed, or negative on error.
1449 *
1450 * Caller should hold map_sem for read and request_mutex.
1451 */
1452static int __map_request(struct ceph_osd_client *osdc,
1453 struct ceph_osd_request *req, int force_resend)
1454{ 2254{
1455 struct ceph_pg pgid; 2255 struct ceph_osd_linger_request *lreq = req->r_priv;
1456 int acting[CEPH_PG_MAX_SIZE]; 2256
1457 int num, o; 2257 mutex_lock(&lreq->lock);
1458 int err; 2258 dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
1459 bool was_paused; 2259 __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
1460 2260 lreq->last_error);
1461 dout("map_request %p tid %lld\n", req, req->r_tid); 2261 if (lreq->register_gen == req->r_ops[0].watch.gen) {
1462 2262 if (!req->r_result) {
1463 err = __calc_request_pg(osdc->osdmap, req, &pgid); 2263 lreq->watch_valid_thru = lreq->ping_sent;
1464 if (err) { 2264 } else if (!lreq->last_error) {
1465 list_move(&req->r_req_lru_item, &osdc->req_notarget); 2265 lreq->last_error = normalize_watch_error(req->r_result);
1466 return err; 2266 queue_watch_error(lreq);
1467 }
1468 req->r_pgid = pgid;
1469
1470 num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
1471 if (num < 0)
1472 num = 0;
1473
1474 was_paused = req->r_paused;
1475 req->r_paused = __req_should_be_paused(osdc, req);
1476 if (was_paused && !req->r_paused)
1477 force_resend = 1;
1478
1479 if ((!force_resend &&
1480 req->r_osd && req->r_osd->o_osd == o &&
1481 req->r_sent >= req->r_osd->o_incarnation &&
1482 req->r_num_pg_osds == num &&
1483 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
1484 (req->r_osd == NULL && o == -1) ||
1485 req->r_paused)
1486 return 0; /* no change */
1487
1488 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
1489 req->r_tid, pgid.pool, pgid.seed, o,
1490 req->r_osd ? req->r_osd->o_osd : -1);
1491
1492 /* record full pg acting set */
1493 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
1494 req->r_num_pg_osds = num;
1495
1496 if (req->r_osd) {
1497 __cancel_request(req);
1498 list_del_init(&req->r_osd_item);
1499 list_del_init(&req->r_linger_osd_item);
1500 req->r_osd = NULL;
1501 }
1502
1503 req->r_osd = __lookup_osd(osdc, o);
1504 if (!req->r_osd && o >= 0) {
1505 err = -ENOMEM;
1506 req->r_osd = create_osd(osdc, o);
1507 if (!req->r_osd) {
1508 list_move(&req->r_req_lru_item, &osdc->req_notarget);
1509 goto out;
1510 } 2267 }
2268 } else {
2269 dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
2270 lreq->register_gen, req->r_ops[0].watch.gen);
2271 }
1511 2272
1512 dout("map_request osd %p is osd%d\n", req->r_osd, o); 2273 mutex_unlock(&lreq->lock);
1513 __insert_osd(osdc, req->r_osd); 2274 linger_put(lreq);
2275}
2276
2277static void send_linger_ping(struct ceph_osd_linger_request *lreq)
2278{
2279 struct ceph_osd_client *osdc = lreq->osdc;
2280 struct ceph_osd_request *req = lreq->ping_req;
2281 struct ceph_osd_req_op *op = &req->r_ops[0];
1514 2282
1515 ceph_con_open(&req->r_osd->o_con, 2283 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1516 CEPH_ENTITY_TYPE_OSD, o, 2284 dout("%s PAUSERD\n", __func__);
1517 &osdc->osdmap->osd_addr[o]); 2285 return;
1518 } 2286 }
1519 2287
1520 __enqueue_request(req); 2288 lreq->ping_sent = jiffies;
1521 err = 1; /* osd or pg changed */ 2289 dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
2290 __func__, lreq, lreq->linger_id, lreq->ping_sent,
2291 lreq->register_gen);
1522 2292
1523out: 2293 if (req->r_osd)
1524 return err; 2294 cancel_linger_request(req);
2295
2296 request_reinit(req);
2297 target_copy(&req->r_t, &lreq->t);
2298
2299 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2300 op->watch.cookie != lreq->linger_id ||
2301 op->watch.op != CEPH_OSD_WATCH_OP_PING);
2302 op->watch.gen = lreq->register_gen;
2303 req->r_callback = linger_ping_cb;
2304 req->r_priv = linger_get(lreq);
2305 req->r_linger = true;
2306
2307 ceph_osdc_get_request(req);
2308 account_request(req);
2309 req->r_tid = atomic64_inc_return(&osdc->last_tid);
2310 link_request(lreq->osd, req);
2311 send_request(req);
1525} 2312}
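
send_linger_ping() stamps each ping with the current register_gen, and linger_ping_cb() only advances watch_valid_thru when the pong carries that same generation, so pongs answering pings from a superseded registration are ignored. The guard, restated as a hedged standalone helper (illustrative only):

#include <stdbool.h>

/* A pong extends watch validity only if it answers a ping sent under
 * the current registration generation and reports success. */
static bool pong_extends_validity(unsigned int current_gen,
				  unsigned int pong_gen, int result)
{
	return current_gen == pong_gen && result == 0;
}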
1526 2313
1527/* 2314static void linger_submit(struct ceph_osd_linger_request *lreq)
1528 * caller should hold map_sem (for read) and request_mutex
1529 */
1530static void __send_request(struct ceph_osd_client *osdc,
1531 struct ceph_osd_request *req)
1532{ 2315{
1533 void *p; 2316 struct ceph_osd_client *osdc = lreq->osdc;
2317 struct ceph_osd *osd;
1534 2318
1535 dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", 2319 calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
1536 req, req->r_tid, req->r_osd->o_osd, req->r_flags, 2320 osd = lookup_create_osd(osdc, lreq->t.osd, true);
1537 (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); 2321 link_linger(osd, lreq);
1538 2322
1539 /* fill in message content that changes each time we send it */ 2323 send_linger(lreq);
1540 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); 2324}
1541 put_unaligned_le32(req->r_flags, req->r_request_flags);
1542 put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
1543 p = req->r_request_pgid;
1544 ceph_encode_64(&p, req->r_pgid.pool);
1545 ceph_encode_32(&p, req->r_pgid.seed);
1546 put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
1547 memcpy(req->r_request_reassert_version, &req->r_reassert_version,
1548 sizeof(req->r_reassert_version));
1549 2325
1550 req->r_stamp = jiffies; 2326static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
1551 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 2327{
2328 struct ceph_osd_client *osdc = lreq->osdc;
2329 struct ceph_osd_linger_request *lookup_lreq;
1552 2330
1553 ceph_msg_get(req->r_request); /* send consumes a ref */ 2331 verify_osdc_wrlocked(osdc);
1554 2332
1555 req->r_sent = req->r_osd->o_incarnation; 2333 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2334 lreq->linger_id);
2335 if (!lookup_lreq)
2336 return;
1556 2337
1557 ceph_con_send(&req->r_osd->o_con, req->r_request); 2338 WARN_ON(lookup_lreq != lreq);
2339 erase_linger_mc(&osdc->linger_map_checks, lreq);
2340 linger_put(lreq);
1558} 2341}
1559 2342
1560/* 2343/*
1561 * Send any requests in the queue (req_unsent). 2344 * @lreq has to be both registered and linked.
1562 */ 2345 */
1563static void __send_queued(struct ceph_osd_client *osdc) 2346static void __linger_cancel(struct ceph_osd_linger_request *lreq)
2347{
2348 if (lreq->is_watch && lreq->ping_req->r_osd)
2349 cancel_linger_request(lreq->ping_req);
2350 if (lreq->reg_req->r_osd)
2351 cancel_linger_request(lreq->reg_req);
2352 cancel_linger_map_check(lreq);
2353 unlink_linger(lreq->osd, lreq);
2354 linger_unregister(lreq);
2355}
2356
2357static void linger_cancel(struct ceph_osd_linger_request *lreq)
1564{ 2358{
1565 struct ceph_osd_request *req, *tmp; 2359 struct ceph_osd_client *osdc = lreq->osdc;
1566 2360
1567 dout("__send_queued\n"); 2361 down_write(&osdc->lock);
1568 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) 2362 if (__linger_registered(lreq))
1569 __send_request(osdc, req); 2363 __linger_cancel(lreq);
2364 up_write(&osdc->lock);
1570} 2365}
1571 2366
1572/* 2367static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
1573 * Caller should hold map_sem for read and request_mutex. 2368
1574 */ 2369static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
1575static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, 2370{
1576 struct ceph_osd_request *req, 2371 struct ceph_osd_client *osdc = lreq->osdc;
1577 bool nofail) 2372 struct ceph_osdmap *map = osdc->osdmap;
1578{ 2373
1579 int rc; 2374 verify_osdc_wrlocked(osdc);
1580 2375 WARN_ON(!map->epoch);
1581 __register_request(osdc, req); 2376
1582 req->r_sent = 0; 2377 if (lreq->register_gen) {
1583 req->r_got_reply = 0; 2378 lreq->map_dne_bound = map->epoch;
1584 rc = __map_request(osdc, req, 0); 2379 dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
1585 if (rc < 0) { 2380 lreq, lreq->linger_id);
1586 if (nofail) { 2381 } else {
1587 dout("osdc_start_request failed map, " 2382 dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
1588 " will retry %lld\n", req->r_tid); 2383 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
1589 rc = 0; 2384 map->epoch);
1590 } else {
1591 __unregister_request(osdc, req);
1592 }
1593 return rc;
1594 } 2385 }
1595 2386
1596 if (req->r_osd == NULL) { 2387 if (lreq->map_dne_bound) {
1597 dout("send_request %p no up osds in pg\n", req); 2388 if (map->epoch >= lreq->map_dne_bound) {
1598 ceph_monc_request_next_osdmap(&osdc->client->monc); 2389 /* we had a new enough map */
2390 pr_info("linger_id %llu pool does not exist\n",
2391 lreq->linger_id);
2392 linger_reg_commit_complete(lreq, -ENOENT);
2393 __linger_cancel(lreq);
2394 }
1599 } else { 2395 } else {
1600 __send_queued(osdc); 2396 send_linger_map_check(lreq);
1601 } 2397 }
2398}
1602 2399
1603 return 0; 2400static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
2401{
2402 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
2403 struct ceph_osd_linger_request *lreq;
2404 u64 linger_id = greq->private_data;
2405
2406 WARN_ON(greq->result || !greq->u.newest);
2407
2408 down_write(&osdc->lock);
2409 lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
2410 if (!lreq) {
2411 dout("%s linger_id %llu dne\n", __func__, linger_id);
2412 goto out_unlock;
2413 }
2414
2415 dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
2416 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
2417 greq->u.newest);
2418 if (!lreq->map_dne_bound)
2419 lreq->map_dne_bound = greq->u.newest;
2420 erase_linger_mc(&osdc->linger_map_checks, lreq);
2421 check_linger_pool_dne(lreq);
2422
2423 linger_put(lreq);
2424out_unlock:
2425 up_write(&osdc->lock);
2426}
2427
2428static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
2429{
2430 struct ceph_osd_client *osdc = lreq->osdc;
2431 struct ceph_osd_linger_request *lookup_lreq;
2432 int ret;
2433
2434 verify_osdc_wrlocked(osdc);
2435
2436 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2437 lreq->linger_id);
2438 if (lookup_lreq) {
2439 WARN_ON(lookup_lreq != lreq);
2440 return;
2441 }
2442
2443 linger_get(lreq);
2444 insert_linger_mc(&osdc->linger_map_checks, lreq);
2445 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
2446 linger_map_check_cb, lreq->linger_id);
2447 WARN_ON(ret);
2448}
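
check_linger_pool_dne() and send_linger_map_check() together decide when a pool is definitively gone: the monitor reports the newest osdmap epoch, which becomes map_dne_bound, and once the client's own map has caught up to that epoch without the pool reappearing, the -ENOENT verdict is final. The bound test, as an illustrative sketch (assumed names):

#include <stdbool.h>

/* The pool is known to be gone once our map is at least as new as the
 * epoch by which it was already absent (map_dne_bound != 0). */
static bool pool_definitely_gone(unsigned int have_epoch,
				 unsigned int map_dne_bound)
{
	return map_dne_bound && have_epoch >= map_dne_bound;
}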
2449
2450static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
2451{
2452 int ret;
2453
2454 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2455 ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
2456 return ret ?: lreq->reg_commit_error;
2457}
2458
2459static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
2460{
2461 int ret;
2462
2463 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2464 ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
2465 return ret ?: lreq->notify_finish_error;
1604} 2466}
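
Both wait helpers above rely on the GNU `ret ?: err` shorthand: an error from the interruptible wait itself (typically -ERESTARTSYS on a signal) takes precedence over the error recorded by the completion callback. The expanded equivalent:

/* `a ?: b` is GNU C for `a ? a : b`, with `a` evaluated once. */
static int first_nonzero(int wait_ret, int recorded_error)
{
	return wait_ret ? wait_ret : recorded_error;
}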
1605 2467
1606/* 2468/*
1607 * Timeout callback, called every N seconds when 1 or more osd 2469 * Timeout callback, called every N seconds. When 1 or more OSD
 1608 * requests has been active for more than N seconds. When this 2470 * requests have been active for more than N seconds, we send a keepalive
1609 * happens, we ping all OSDs with requests who have timed out to 2471 * (tag + timestamp) to its OSD to ensure any communications channel
1610 * ensure any communications channel reset is detected. Reset the 2472 * reset is detected.
1611 * request timeouts another N seconds in the future as we go.
1612 * Reschedule the timeout event another N seconds in future (unless
1613 * there are no open requests).
1614 */ 2473 */
1615static void handle_timeout(struct work_struct *work) 2474static void handle_timeout(struct work_struct *work)
1616{ 2475{
1617 struct ceph_osd_client *osdc = 2476 struct ceph_osd_client *osdc =
1618 container_of(work, struct ceph_osd_client, timeout_work.work); 2477 container_of(work, struct ceph_osd_client, timeout_work.work);
1619 struct ceph_options *opts = osdc->client->options; 2478 struct ceph_options *opts = osdc->client->options;
1620 struct ceph_osd_request *req; 2479 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
1621 struct ceph_osd *osd; 2480 LIST_HEAD(slow_osds);
1622 struct list_head slow_osds; 2481 struct rb_node *n, *p;
1623 dout("timeout\n");
1624 down_read(&osdc->map_sem);
1625
1626 ceph_monc_request_next_osdmap(&osdc->client->monc);
1627 2482
1628 mutex_lock(&osdc->request_mutex); 2483 dout("%s osdc %p\n", __func__, osdc);
2484 down_write(&osdc->lock);
1629 2485
1630 /* 2486 /*
1631 * ping osds that are a bit slow. this ensures that if there 2487 * ping osds that are a bit slow. this ensures that if there
1632 * is a break in the TCP connection we will notice, and reopen 2488 * is a break in the TCP connection we will notice, and reopen
1633 * a connection with that osd (from the fault callback). 2489 * a connection with that osd (from the fault callback).
1634 */ 2490 */
1635 INIT_LIST_HEAD(&slow_osds); 2491 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
1636 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 2492 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
1637 if (time_before(jiffies, 2493 bool found = false;
1638 req->r_stamp + opts->osd_keepalive_timeout)) 2494
1639 break; 2495 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2496 struct ceph_osd_request *req =
2497 rb_entry(p, struct ceph_osd_request, r_node);
2498
2499 if (time_before(req->r_stamp, cutoff)) {
2500 dout(" req %p tid %llu on osd%d is laggy\n",
2501 req, req->r_tid, osd->o_osd);
2502 found = true;
2503 }
2504 }
2505 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
2506 struct ceph_osd_linger_request *lreq =
2507 rb_entry(p, struct ceph_osd_linger_request, node);
2508
2509 dout(" lreq %p linger_id %llu is served by osd%d\n",
2510 lreq, lreq->linger_id, osd->o_osd);
2511 found = true;
2512
2513 mutex_lock(&lreq->lock);
2514 if (lreq->is_watch && lreq->committed && !lreq->last_error)
2515 send_linger_ping(lreq);
2516 mutex_unlock(&lreq->lock);
2517 }
1640 2518
1641 osd = req->r_osd; 2519 if (found)
1642 BUG_ON(!osd); 2520 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1643 dout(" tid %llu is slow, will send keepalive on osd%d\n",
1644 req->r_tid, osd->o_osd);
1645 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1646 } 2521 }
2522
2523 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
2524 maybe_request_map(osdc);
2525
1647 while (!list_empty(&slow_osds)) { 2526 while (!list_empty(&slow_osds)) {
1648 osd = list_entry(slow_osds.next, struct ceph_osd, 2527 struct ceph_osd *osd = list_first_entry(&slow_osds,
1649 o_keepalive_item); 2528 struct ceph_osd,
2529 o_keepalive_item);
1650 list_del_init(&osd->o_keepalive_item); 2530 list_del_init(&osd->o_keepalive_item);
1651 ceph_con_keepalive(&osd->o_con); 2531 ceph_con_keepalive(&osd->o_con);
1652 } 2532 }
1653 2533
1654 __schedule_osd_timeout(osdc); 2534 up_write(&osdc->lock);
1655 __send_queued(osdc); 2535 schedule_delayed_work(&osdc->timeout_work,
1656 mutex_unlock(&osdc->request_mutex); 2536 osdc->client->options->osd_keepalive_timeout);
1657 up_read(&osdc->map_sem);
1658} 2537}
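
handle_timeout() above computes one cutoff (jiffies minus osd_keepalive_timeout) and treats any request whose r_stamp falls before it as laggy, then sends a single keepalive per affected OSD rather than one per request. The wrap-safe comparison can be sketched outside the kernel as:

#include <stdbool.h>

/* time_before(a, b) in the kernel expands to a wrap-safe signed
 * subtraction; the same idea applied to the laggy test above. */
static bool request_is_laggy(unsigned long stamp, unsigned long now,
			     unsigned long keepalive_timeout)
{
	unsigned long cutoff = now - keepalive_timeout;

	return (long)(stamp - cutoff) < 0;	/* stamp is before cutoff */
}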
1659 2538
1660static void handle_osds_timeout(struct work_struct *work) 2539static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work)
1663 container_of(work, struct ceph_osd_client, 2542 container_of(work, struct ceph_osd_client,
1664 osds_timeout_work.work); 2543 osds_timeout_work.work);
1665 unsigned long delay = osdc->client->options->osd_idle_ttl / 4; 2544 unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
2545 struct ceph_osd *osd, *nosd;
1666 2546
1667 dout("osds timeout\n"); 2547 dout("%s osdc %p\n", __func__, osdc);
1668 down_read(&osdc->map_sem); 2548 down_write(&osdc->lock);
1669 remove_old_osds(osdc); 2549 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
1670 up_read(&osdc->map_sem); 2550 if (time_before(jiffies, osd->lru_ttl))
2551 break;
1671 2552
2553 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
2554 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
2555 close_osd(osd);
2556 }
2557
2558 up_write(&osdc->lock);
1672 schedule_delayed_work(&osdc->osds_timeout_work, 2559 schedule_delayed_work(&osdc->osds_timeout_work,
1673 round_jiffies_relative(delay)); 2560 round_jiffies_relative(delay));
1674} 2561}
@@ -1776,107 +2663,76 @@ e_inval:
1776 goto out; 2663 goto out;
1777} 2664}
1778 2665
1779static void complete_request(struct ceph_osd_request *req) 2666struct MOSDOpReply {
1780{ 2667 struct ceph_pg pgid;
1781 complete_all(&req->r_safe_completion); /* fsync waiter */ 2668 u64 flags;
1782} 2669 int result;
2670 u32 epoch;
2671 int num_ops;
2672 u32 outdata_len[CEPH_OSD_MAX_OPS];
2673 s32 rval[CEPH_OSD_MAX_OPS];
2674 int retry_attempt;
2675 struct ceph_eversion replay_version;
2676 u64 user_version;
2677 struct ceph_request_redirect redirect;
2678};
1783 2679
1784/* 2680static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
1785 * handle osd op reply. either call the callback if it is specified,
1786 * or do the completion to wake up the waiting thread.
1787 */
1788static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1789{ 2681{
1790 void *p, *end; 2682 void *p = msg->front.iov_base;
1791 struct ceph_osd_request *req; 2683 void *const end = p + msg->front.iov_len;
1792 struct ceph_request_redirect redir; 2684 u16 version = le16_to_cpu(msg->hdr.version);
1793 u64 tid; 2685 struct ceph_eversion bad_replay_version;
1794 int object_len;
1795 unsigned int numops;
1796 int payload_len, flags;
1797 s32 result;
1798 s32 retry_attempt;
1799 struct ceph_pg pg;
1800 int err;
1801 u32 reassert_epoch;
1802 u64 reassert_version;
1803 u32 osdmap_epoch;
1804 int already_completed;
1805 u32 bytes;
1806 u8 decode_redir; 2686 u8 decode_redir;
1807 unsigned int i; 2687 u32 len;
1808 2688 int ret;
1809 tid = le64_to_cpu(msg->hdr.tid); 2689 int i;
1810 dout("handle_reply %p tid %llu\n", msg, tid);
1811 2690
1812 p = msg->front.iov_base; 2691 ceph_decode_32_safe(&p, end, len, e_inval);
1813 end = p + msg->front.iov_len; 2692 ceph_decode_need(&p, end, len, e_inval);
2693 p += len; /* skip oid */
1814 2694
1815 ceph_decode_need(&p, end, 4, bad); 2695 ret = ceph_decode_pgid(&p, end, &m->pgid);
1816 object_len = ceph_decode_32(&p); 2696 if (ret)
1817 ceph_decode_need(&p, end, object_len, bad); 2697 return ret;
1818 p += object_len;
1819 2698
1820 err = ceph_decode_pgid(&p, end, &pg); 2699 ceph_decode_64_safe(&p, end, m->flags, e_inval);
1821 if (err) 2700 ceph_decode_32_safe(&p, end, m->result, e_inval);
1822 goto bad; 2701 ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
2702 memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
2703 p += sizeof(bad_replay_version);
2704 ceph_decode_32_safe(&p, end, m->epoch, e_inval);
1823 2705
1824 ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); 2706 ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
1825 flags = ceph_decode_64(&p); 2707 if (m->num_ops > ARRAY_SIZE(m->outdata_len))
1826 result = ceph_decode_32(&p); 2708 goto e_inval;
1827 reassert_epoch = ceph_decode_32(&p);
1828 reassert_version = ceph_decode_64(&p);
1829 osdmap_epoch = ceph_decode_32(&p);
1830
1831 /* lookup */
1832 down_read(&osdc->map_sem);
1833 mutex_lock(&osdc->request_mutex);
1834 req = __lookup_request(osdc, tid);
1835 if (req == NULL) {
1836 dout("handle_reply tid %llu dne\n", tid);
1837 goto bad_mutex;
1838 }
1839 ceph_osdc_get_request(req);
1840 2709
1841 dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, 2710 ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
1842 req, result); 2711 e_inval);
1843 2712 for (i = 0; i < m->num_ops; i++) {
1844 ceph_decode_need(&p, end, 4, bad_put);
1845 numops = ceph_decode_32(&p);
1846 if (numops > CEPH_OSD_MAX_OPS)
1847 goto bad_put;
1848 if (numops != req->r_num_ops)
1849 goto bad_put;
1850 payload_len = 0;
1851 ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
1852 for (i = 0; i < numops; i++) {
1853 struct ceph_osd_op *op = p; 2713 struct ceph_osd_op *op = p;
1854 int len;
1855 2714
1856 len = le32_to_cpu(op->payload_len); 2715 m->outdata_len[i] = le32_to_cpu(op->payload_len);
1857 req->r_ops[i].outdata_len = len;
1858 dout(" op %d has %d bytes\n", i, len);
1859 payload_len += len;
1860 p += sizeof(*op); 2716 p += sizeof(*op);
1861 } 2717 }
1862 bytes = le32_to_cpu(msg->hdr.data_len);
1863 if (payload_len != bytes) {
1864 pr_warn("sum of op payload lens %d != data_len %d\n",
1865 payload_len, bytes);
1866 goto bad_put;
1867 }
1868 2718
1869 ceph_decode_need(&p, end, 4 + numops * 4, bad_put); 2719 ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
1870 retry_attempt = ceph_decode_32(&p); 2720 for (i = 0; i < m->num_ops; i++)
1871 for (i = 0; i < numops; i++) 2721 ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
1872 req->r_ops[i].rval = ceph_decode_32(&p);
1873 2722
1874 if (le16_to_cpu(msg->hdr.version) >= 6) { 2723 if (version >= 5) {
1875 p += 8 + 4; /* skip replay_version */ 2724 ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
1876 p += 8; /* skip user_version */ 2725 memcpy(&m->replay_version, p, sizeof(m->replay_version));
2726 p += sizeof(m->replay_version);
2727 ceph_decode_64_safe(&p, end, m->user_version, e_inval);
2728 } else {
2729 m->replay_version = bad_replay_version; /* struct */
2730 m->user_version = le64_to_cpu(m->replay_version.version);
2731 }
1877 2732
1878 if (le16_to_cpu(msg->hdr.version) >= 7) 2733 if (version >= 6) {
1879 ceph_decode_8_safe(&p, end, decode_redir, bad_put); 2734 if (version >= 7)
2735 ceph_decode_8_safe(&p, end, decode_redir, e_inval);
1880 else 2736 else
1881 decode_redir = 1; 2737 decode_redir = 1;
1882 } else { 2738 } else {
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1884 } 2740 }
1885 2741
1886 if (decode_redir) { 2742 if (decode_redir) {
1887 err = ceph_redirect_decode(&p, end, &redir); 2743 ret = ceph_redirect_decode(&p, end, &m->redirect);
1888 if (err) 2744 if (ret)
1889 goto bad_put; 2745 return ret;
1890 } else { 2746 } else {
1891 redir.oloc.pool = -1; 2747 ceph_oloc_init(&m->redirect.oloc);
1892 } 2748 }
1893 2749
1894 if (redir.oloc.pool != -1) { 2750 return 0;
1895 dout("redirect pool %lld\n", redir.oloc.pool);
1896
1897 __unregister_request(osdc, req);
1898
1899 req->r_target_oloc = redir.oloc; /* struct */
1900 2751
1901 /* 2752e_inval:
1902 * Start redirect requests with nofail=true. If 2753 return -EINVAL;
1903 * mapping fails, request will end up on the notarget 2754}
1904 * list, waiting for the new osdmap (which can take
1905 * a while), even though the original request mapped
1906 * successfully. In the future we might want to follow
1907 * original request's nofail setting here.
1908 */
1909 err = __ceph_osdc_start_request(osdc, req, true);
1910 BUG_ON(err);
1911 2755
1912 goto out_unlock; 2756/*
1913 } 2757 * We are done with @req if
2758 * - @m is a safe reply, or
2759 * - @m is an unsafe reply and we didn't want a safe one
2760 */
2761static bool done_request(const struct ceph_osd_request *req,
2762 const struct MOSDOpReply *m)
2763{
2764 return (m->result < 0 ||
2765 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
2766 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
2767}
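
done_request() condenses the ack/commit protocol into a single predicate. A standalone restatement with the flag tests reduced to booleans (illustrative, not the kernel's types):

#include <stdbool.h>

/* Finished when the op failed, the reply is an on-disk commit, or the
 * request never asked for an on-disk commit in the first place. */
static bool reply_finishes_request(int result, bool reply_ondisk,
				   bool wanted_ondisk)
{
	return result < 0 || reply_ondisk || !wanted_ondisk;
}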
1914 2768
1915 already_completed = req->r_got_reply; 2769/*
1916 if (!req->r_got_reply) { 2770 * handle osd op reply. either call the callback if it is specified,
1917 req->r_result = result; 2771 * or do the completion to wake up the waiting thread.
1918 dout("handle_reply result %d bytes %d\n", req->r_result, 2772 *
1919 bytes); 2773 * ->r_unsafe_callback is set? yes no
1920 if (req->r_result == 0) 2774 *
1921 req->r_result = bytes; 2775 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
2776 * any or needed/got safe) r_safe_completion r_safe_completion
2777 *
2778 * first reply is unsafe r_unsafe_cb(true) (nothing)
2779 *
2780 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
2781 * r_safe_completion r_safe_completion
2782 */
2783static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2784{
2785 struct ceph_osd_client *osdc = osd->o_osdc;
2786 struct ceph_osd_request *req;
2787 struct MOSDOpReply m;
2788 u64 tid = le64_to_cpu(msg->hdr.tid);
2789 u32 data_len = 0;
2790 bool already_acked;
2791 int ret;
2792 int i;
1922 2793
1923 /* in case this is a write and we need to replay, */ 2794 dout("%s msg %p tid %llu\n", __func__, msg, tid);
1924 req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
1925 req->r_reassert_version.version = cpu_to_le64(reassert_version);
1926 2795
1927 req->r_got_reply = 1; 2796 down_read(&osdc->lock);
1928 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 2797 if (!osd_registered(osd)) {
1929 dout("handle_reply tid %llu dup ack\n", tid); 2798 dout("%s osd%d unknown\n", __func__, osd->o_osd);
1930 goto out_unlock; 2799 goto out_unlock_osdc;
1931 } 2800 }
2801 WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
1932 2802
1933 dout("handle_reply tid %llu flags %d\n", tid, flags); 2803 mutex_lock(&osd->lock);
2804 req = lookup_request(&osd->o_requests, tid);
2805 if (!req) {
2806 dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
2807 goto out_unlock_session;
2808 }
1934 2809
1935 if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) 2810 ret = decode_MOSDOpReply(msg, &m);
1936 __register_linger_request(osdc, req); 2811 if (ret) {
2812 pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
2813 req->r_tid, ret);
2814 ceph_msg_dump(msg);
2815 goto fail_request;
2816 }
2817 dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
2818 __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
2819 m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
2820 le64_to_cpu(m.replay_version.version), m.user_version);
2821
2822 if (m.retry_attempt >= 0) {
2823 if (m.retry_attempt != req->r_attempts - 1) {
2824 dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
2825 req, req->r_tid, m.retry_attempt,
2826 req->r_attempts - 1);
2827 goto out_unlock_session;
2828 }
2829 } else {
2830 WARN_ON(1); /* MOSDOpReply v4 is assumed */
2831 }
1937 2832
1938 /* either this is a read, or we got the safe response */ 2833 if (!ceph_oloc_empty(&m.redirect.oloc)) {
1939 if (result < 0 || 2834 dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
1940 (flags & CEPH_OSD_FLAG_ONDISK) || 2835 m.redirect.oloc.pool);
1941 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 2836 unlink_request(osd, req);
1942 __unregister_request(osdc, req); 2837 mutex_unlock(&osd->lock);
2838
2839 ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
2840 req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
2841 req->r_tid = 0;
2842 __submit_request(req, false);
2843 goto out_unlock_osdc;
2844 }
1943 2845
1944 mutex_unlock(&osdc->request_mutex); 2846 if (m.num_ops != req->r_num_ops) {
1945 up_read(&osdc->map_sem); 2847 pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
2848 req->r_num_ops, req->r_tid);
2849 goto fail_request;
2850 }
2851 for (i = 0; i < req->r_num_ops; i++) {
2852 dout(" req %p tid %llu op %d rval %d len %u\n", req,
2853 req->r_tid, i, m.rval[i], m.outdata_len[i]);
2854 req->r_ops[i].rval = m.rval[i];
2855 req->r_ops[i].outdata_len = m.outdata_len[i];
2856 data_len += m.outdata_len[i];
2857 }
2858 if (data_len != le32_to_cpu(msg->hdr.data_len)) {
2859 pr_err("sum of lens %u != %u for tid %llu\n", data_len,
2860 le32_to_cpu(msg->hdr.data_len), req->r_tid);
2861 goto fail_request;
2862 }
2863 dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
2864 req, req->r_tid, req->r_got_reply, m.result, data_len);
2865
2866 already_acked = req->r_got_reply;
2867 if (!already_acked) {
2868 req->r_result = m.result ?: data_len;
2869 req->r_replay_version = m.replay_version; /* struct */
2870 req->r_got_reply = true;
2871 } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
2872 dout("req %p tid %llu dup ack\n", req, req->r_tid);
2873 goto out_unlock_session;
2874 }
1946 2875
1947 if (!already_completed) { 2876 if (done_request(req, &m)) {
1948 if (req->r_unsafe_callback && 2877 __finish_request(req);
1949 result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) 2878 if (req->r_linger) {
1950 req->r_unsafe_callback(req, true); 2879 WARN_ON(req->r_unsafe_callback);
1951 if (req->r_callback) 2880 dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
1952 req->r_callback(req, msg); 2881 __complete_request(req);
1953 else 2882 }
1954 complete_all(&req->r_completion);
1955 } 2883 }
1956 2884
1957 if (flags & CEPH_OSD_FLAG_ONDISK) { 2885 mutex_unlock(&osd->lock);
1958 if (req->r_unsafe_callback && already_completed) 2886 up_read(&osdc->lock);
2887
2888 if (done_request(req, &m)) {
2889 if (already_acked && req->r_unsafe_callback) {
2890 dout("req %p tid %llu safe-cb\n", req, req->r_tid);
1959 req->r_unsafe_callback(req, false); 2891 req->r_unsafe_callback(req, false);
1960 complete_request(req); 2892 } else if (!req->r_linger) {
2893 dout("req %p tid %llu cb\n", req, req->r_tid);
2894 __complete_request(req);
2895 }
2896 } else {
2897 if (req->r_unsafe_callback) {
2898 dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
2899 req->r_unsafe_callback(req, true);
2900 } else {
2901 WARN_ON(1);
2902 }
1961 } 2903 }
2904 if (m.flags & CEPH_OSD_FLAG_ONDISK)
2905 complete_all(&req->r_safe_completion);
1962 2906
1963out:
1964 dout("req=%p req->r_linger=%d\n", req, req->r_linger);
1965 ceph_osdc_put_request(req); 2907 ceph_osdc_put_request(req);
1966 return; 2908 return;
1967out_unlock:
1968 mutex_unlock(&osdc->request_mutex);
1969 up_read(&osdc->map_sem);
1970 goto out;
1971 2909
1972bad_put: 2910fail_request:
1973 req->r_result = -EIO; 2911 complete_request(req, -EIO);
1974 __unregister_request(osdc, req); 2912out_unlock_session:
1975 if (req->r_callback) 2913 mutex_unlock(&osd->lock);
1976 req->r_callback(req, msg); 2914out_unlock_osdc:
1977 else 2915 up_read(&osdc->lock);
1978 complete_all(&req->r_completion);
1979 complete_request(req);
1980 ceph_osdc_put_request(req);
1981bad_mutex:
1982 mutex_unlock(&osdc->request_mutex);
1983 up_read(&osdc->map_sem);
1984bad:
1985 pr_err("corrupt osd_op_reply got %d %d\n",
1986 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
1987 ceph_msg_dump(msg);
1988} 2916}
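
The table in the handle_reply() comment above reads as a small decision procedure: linger requests aside, which notification fires depends on whether the reply finishes the request (per done_request()), whether an ack was already seen, and whether an unsafe callback is registered. A hedged restatement with hypothetical names:

#include <stdio.h>
#include <stdbool.h>

/* Prints which notification the dispatch above would issue; purely
 * illustrative, ignoring the linger special case and all locking. */
static void dispatch_reply(bool done, bool already_acked, bool has_unsafe_cb)
{
	if (done) {
		if (already_acked && has_unsafe_cb)
			printf("r_unsafe_callback(req, false)\n");
		else
			printf("r_callback / complete r_completion\n");
	} else {
		/* first, unsafe reply: acked but not yet on disk */
		printf("r_unsafe_callback(req, true)\n");
	}
}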
1989 2917
1990static void reset_changed_osds(struct ceph_osd_client *osdc) 2918static void set_pool_was_full(struct ceph_osd_client *osdc)
1991{ 2919{
1992 struct rb_node *p, *n; 2920 struct rb_node *n;
1993 2921
1994 dout("%s %p\n", __func__, osdc); 2922 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1995 for (p = rb_first(&osdc->osds); p; p = n) { 2923 struct ceph_pg_pool_info *pi =
1996 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 2924 rb_entry(n, struct ceph_pg_pool_info, node);
1997 2925
1998 n = rb_next(p); 2926 pi->was_full = __pool_full(pi);
1999 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2000 memcmp(&osd->o_con.peer_addr,
2001 ceph_osd_addr(osdc->osdmap,
2002 osd->o_osd),
2003 sizeof(struct ceph_entity_addr)) != 0)
2004 __reset_osd(osdc, osd);
2005 } 2927 }
2006} 2928}
2007 2929
2930static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
2931{
2932 struct ceph_pg_pool_info *pi;
2933
2934 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
2935 if (!pi)
2936 return false;
2937
2938 return pi->was_full && !__pool_full(pi);
2939}
2940
2941static enum calc_target_result
2942recalc_linger_target(struct ceph_osd_linger_request *lreq)
2943{
2944 struct ceph_osd_client *osdc = lreq->osdc;
2945 enum calc_target_result ct_res;
2946
2947 ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
2948 if (ct_res == CALC_TARGET_NEED_RESEND) {
2949 struct ceph_osd *osd;
2950
2951 osd = lookup_create_osd(osdc, lreq->t.osd, true);
2952 if (osd != lreq->osd) {
2953 unlink_linger(lreq->osd, lreq);
2954 link_linger(osd, lreq);
2955 }
2956 }
2957
2958 return ct_res;
2959}
2960
2008/* 2961/*
2009 * Requeue requests whose mapping to an OSD has changed. If requests map to 2962 * Requeue requests whose mapping to an OSD has changed.
2010 * no osd, request a new map.
2011 *
2012 * Caller should hold map_sem for read.
2013 */ 2963 */
2014static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, 2964static void scan_requests(struct ceph_osd *osd,
2015 bool force_resend_writes) 2965 bool force_resend,
2966 bool cleared_full,
2967 bool check_pool_cleared_full,
2968 struct rb_root *need_resend,
2969 struct list_head *need_resend_linger)
2016{ 2970{
2017 struct ceph_osd_request *req, *nreq; 2971 struct ceph_osd_client *osdc = osd->o_osdc;
2018 struct rb_node *p; 2972 struct rb_node *n;
2019 int needmap = 0; 2973 bool force_resend_writes;
2020 int err; 2974
2021 bool force_resend_req; 2975 for (n = rb_first(&osd->o_linger_requests); n; ) {
2976 struct ceph_osd_linger_request *lreq =
2977 rb_entry(n, struct ceph_osd_linger_request, node);
2978 enum calc_target_result ct_res;
2979
2980 n = rb_next(n); /* recalc_linger_target() */
2981
2982 dout("%s lreq %p linger_id %llu\n", __func__, lreq,
2983 lreq->linger_id);
2984 ct_res = recalc_linger_target(lreq);
2985 switch (ct_res) {
2986 case CALC_TARGET_NO_ACTION:
2987 force_resend_writes = cleared_full ||
2988 (check_pool_cleared_full &&
2989 pool_cleared_full(osdc, lreq->t.base_oloc.pool));
2990 if (!force_resend && !force_resend_writes)
2991 break;
2992
2993 /* fall through */
2994 case CALC_TARGET_NEED_RESEND:
2995 cancel_linger_map_check(lreq);
2996 /*
2997 * scan_requests() for the previous epoch(s)
2998 * may have already added it to the list, since
2999 * it's not unlinked here.
3000 */
3001 if (list_empty(&lreq->scan_item))
3002 list_add_tail(&lreq->scan_item, need_resend_linger);
3003 break;
3004 case CALC_TARGET_POOL_DNE:
3005 check_linger_pool_dne(lreq);
3006 break;
3007 }
3008 }
2022 3009
2023 dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", 3010 for (n = rb_first(&osd->o_requests); n; ) {
2024 force_resend_writes ? " (force resend writes)" : ""); 3011 struct ceph_osd_request *req =
2025 mutex_lock(&osdc->request_mutex); 3012 rb_entry(n, struct ceph_osd_request, r_node);
2026 for (p = rb_first(&osdc->requests); p; ) { 3013 enum calc_target_result ct_res;
2027 req = rb_entry(p, struct ceph_osd_request, r_node); 3014
2028 p = rb_next(p); 3015 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3016
3017 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3018 ct_res = calc_target(osdc, &req->r_t,
3019 &req->r_last_force_resend, false);
3020 switch (ct_res) {
3021 case CALC_TARGET_NO_ACTION:
3022 force_resend_writes = cleared_full ||
3023 (check_pool_cleared_full &&
3024 pool_cleared_full(osdc, req->r_t.base_oloc.pool));
3025 if (!force_resend &&
3026 (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
3027 !force_resend_writes))
3028 break;
3029
3030 /* fall through */
3031 case CALC_TARGET_NEED_RESEND:
3032 cancel_map_check(req);
3033 unlink_request(osd, req);
3034 insert_request(need_resend, req);
3035 break;
3036 case CALC_TARGET_POOL_DNE:
3037 check_pool_dne(req);
3038 break;
3039 }
3040 }
3041}
2029 3042
3043static int handle_one_map(struct ceph_osd_client *osdc,
3044 void *p, void *end, bool incremental,
3045 struct rb_root *need_resend,
3046 struct list_head *need_resend_linger)
3047{
3048 struct ceph_osdmap *newmap;
3049 struct rb_node *n;
3050 bool skipped_map = false;
3051 bool was_full;
3052
3053 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
3054 set_pool_was_full(osdc);
3055
3056 if (incremental)
3057 newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
3058 else
3059 newmap = ceph_osdmap_decode(&p, end);
3060 if (IS_ERR(newmap))
3061 return PTR_ERR(newmap);
3062
3063 if (newmap != osdc->osdmap) {
2030 /* 3064 /*
2031 * For linger requests that have not yet been 3065 * Preserve ->was_full before destroying the old map.
2032 * registered, move them to the linger list; they'll 3066 * For pools that weren't in the old map, ->was_full
2033 * be sent to the osd in the loop below. Unregister 3067 * should be false.
2034 * the request before re-registering it as a linger
2035 * request to ensure the __map_request() below
2036 * will decide it needs to be sent.
2037 */ 3068 */
2038 if (req->r_linger && list_empty(&req->r_linger_item)) { 3069 for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
2039 dout("%p tid %llu restart on osd%d\n", 3070 struct ceph_pg_pool_info *pi =
2040 req, req->r_tid, 3071 rb_entry(n, struct ceph_pg_pool_info, node);
2041 req->r_osd ? req->r_osd->o_osd : -1); 3072 struct ceph_pg_pool_info *old_pi;
2042 ceph_osdc_get_request(req); 3073
2043 __unregister_request(osdc, req); 3074 old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
2044 __register_linger_request(osdc, req); 3075 if (old_pi)
2045 ceph_osdc_put_request(req); 3076 pi->was_full = old_pi->was_full;
2046 continue; 3077 else
3078 WARN_ON(pi->was_full);
2047 } 3079 }
2048 3080
2049 force_resend_req = force_resend || 3081 if (osdc->osdmap->epoch &&
2050 (force_resend_writes && 3082 osdc->osdmap->epoch + 1 < newmap->epoch) {
2051 req->r_flags & CEPH_OSD_FLAG_WRITE); 3083 WARN_ON(incremental);
2052 err = __map_request(osdc, req, force_resend_req); 3084 skipped_map = true;
2053 if (err < 0)
2054 continue; /* error */
2055 if (req->r_osd == NULL) {
2056 dout("%p tid %llu maps to no osd\n", req, req->r_tid);
2057 needmap++; /* request a newer map */
2058 } else if (err > 0) {
2059 if (!req->r_linger) {
2060 dout("%p tid %llu requeued on osd%d\n", req,
2061 req->r_tid,
2062 req->r_osd ? req->r_osd->o_osd : -1);
2063 req->r_flags |= CEPH_OSD_FLAG_RETRY;
2064 }
2065 } 3085 }
3086
3087 ceph_osdmap_destroy(osdc->osdmap);
3088 osdc->osdmap = newmap;
2066 } 3089 }
2067 3090
2068 list_for_each_entry_safe(req, nreq, &osdc->req_linger, 3091 was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
2069 r_linger_item) { 3092 scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
2070 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 3093 need_resend, need_resend_linger);
2071 3094
2072 err = __map_request(osdc, req, 3095 for (n = rb_first(&osdc->osds); n; ) {
2073 force_resend || force_resend_writes); 3096 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2074 dout("__map_request returned %d\n", err); 3097
2075 if (err < 0) 3098 n = rb_next(n); /* close_osd() */
2076 continue; /* hrm! */ 3099
2077 if (req->r_osd == NULL || err > 0) { 3100 scan_requests(osd, skipped_map, was_full, true, need_resend,
2078 if (req->r_osd == NULL) { 3101 need_resend_linger);
2079 dout("lingering %p tid %llu maps to no osd\n", 3102 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2080 req, req->r_tid); 3103 memcmp(&osd->o_con.peer_addr,
2081 /* 3104 ceph_osd_addr(osdc->osdmap, osd->o_osd),
2082 * A homeless lingering request makes 3105 sizeof(struct ceph_entity_addr)))
2083 * no sense, as it's job is to keep 3106 close_osd(osd);
2084 * a particular OSD connection open. 3107 }
2085 * Request a newer map and kick the
2086 * request, knowing that it won't be
2087 * resent until we actually get a map
2088 * that can tell us where to send it.
2089 */
2090 needmap++;
2091 }
2092 3108
2093 dout("kicking lingering %p tid %llu osd%d\n", req, 3109 return 0;
2094 req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); 3110}
2095 __register_request(osdc, req); 3111
2096 __unregister_linger_request(osdc, req); 3112static void kick_requests(struct ceph_osd_client *osdc,
3113 struct rb_root *need_resend,
3114 struct list_head *need_resend_linger)
3115{
3116 struct ceph_osd_linger_request *lreq, *nlreq;
3117 struct rb_node *n;
3118
3119 for (n = rb_first(need_resend); n; ) {
3120 struct ceph_osd_request *req =
3121 rb_entry(n, struct ceph_osd_request, r_node);
3122 struct ceph_osd *osd;
3123
3124 n = rb_next(n);
3125 erase_request(need_resend, req); /* before link_request() */
3126
3127 WARN_ON(req->r_osd);
3128 calc_target(osdc, &req->r_t, NULL, false);
3129 osd = lookup_create_osd(osdc, req->r_t.osd, true);
3130 link_request(osd, req);
3131 if (!req->r_linger) {
3132 if (!osd_homeless(osd) && !req->r_t.paused)
3133 send_request(req);
3134 } else {
3135 cancel_linger_request(req);
2097 } 3136 }
2098 } 3137 }
2099 reset_changed_osds(osdc);
2100 mutex_unlock(&osdc->request_mutex);
2101 3138
2102 if (needmap) { 3139 list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
2103 dout("%d requests for down osds, need new map\n", needmap); 3140 if (!osd_homeless(lreq->osd))
2104 ceph_monc_request_next_osdmap(&osdc->client->monc); 3141 send_linger(lreq);
3142
3143 list_del_init(&lreq->scan_item);
2105 } 3144 }
2106} 3145}
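
scan_requests() reduces each osdmap change to one of three per-request outcomes, and kick_requests() then re-targets and resends whatever landed on the need_resend lists. The outcome handling, compressed into a sketch (enum values mirror the CALC_TARGET_* constants; not kernel code):

#include <stdbool.h>

enum ct_result { CT_NO_ACTION, CT_NEED_RESEND, CT_POOL_DNE };

static const char *map_change_action(enum ct_result ct_res, bool force_resend)
{
	switch (ct_res) {
	case CT_NO_ACTION:
		return force_resend ? "resend anyway" : "leave in place";
	case CT_NEED_RESEND:
		return "unlink and queue for resend";	/* target moved */
	case CT_POOL_DNE:
		return "check whether the pool is gone";
	}
	return "unreachable";
}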
2107 3146
2108
2109/* 3147/*
2110 * Process updated osd map. 3148 * Process updated osd map.
2111 * 3149 *
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
2115 */ 3153 */
2116void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) 3154void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2117{ 3155{
2118 void *p, *end, *next; 3156 void *p = msg->front.iov_base;
3157 void *const end = p + msg->front.iov_len;
2119 u32 nr_maps, maplen; 3158 u32 nr_maps, maplen;
2120 u32 epoch; 3159 u32 epoch;
2121 struct ceph_osdmap *newmap = NULL, *oldmap;
2122 int err;
2123 struct ceph_fsid fsid; 3160 struct ceph_fsid fsid;
2124 bool was_full; 3161 struct rb_root need_resend = RB_ROOT;
3162 LIST_HEAD(need_resend_linger);
3163 bool handled_incremental = false;
3164 bool was_pauserd, was_pausewr;
3165 bool pauserd, pausewr;
3166 int err;
2125 3167
2126 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); 3168 dout("%s have %u\n", __func__, osdc->osdmap->epoch);
2127 p = msg->front.iov_base; 3169 down_write(&osdc->lock);
2128 end = p + msg->front.iov_len;
2129 3170
2130 /* verify fsid */ 3171 /* verify fsid */
2131 ceph_decode_need(&p, end, sizeof(fsid), bad); 3172 ceph_decode_need(&p, end, sizeof(fsid), bad);
2132 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3173 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2133 if (ceph_check_fsid(osdc->client, &fsid) < 0) 3174 if (ceph_check_fsid(osdc->client, &fsid) < 0)
2134 return; 3175 goto bad;
2135 3176
2136 down_write(&osdc->map_sem); 3177 was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2137 3178 was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2138 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 3179 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
3180 have_pool_full(osdc);
2139 3181
2140 /* incremental maps */ 3182 /* incremental maps */
2141 ceph_decode_32_safe(&p, end, nr_maps, bad); 3183 ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2145 epoch = ceph_decode_32(&p); 3187 epoch = ceph_decode_32(&p);
2146 maplen = ceph_decode_32(&p); 3188 maplen = ceph_decode_32(&p);
2147 ceph_decode_need(&p, end, maplen, bad); 3189 ceph_decode_need(&p, end, maplen, bad);
2148 next = p + maplen; 3190 if (osdc->osdmap->epoch &&
2149 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { 3191 osdc->osdmap->epoch + 1 == epoch) {
2150 dout("applying incremental map %u len %d\n", 3192 dout("applying incremental map %u len %d\n",
2151 epoch, maplen); 3193 epoch, maplen);
2152 newmap = osdmap_apply_incremental(&p, next, 3194 err = handle_one_map(osdc, p, p + maplen, true,
2153 osdc->osdmap, 3195 &need_resend, &need_resend_linger);
2154 &osdc->client->msgr); 3196 if (err)
2155 if (IS_ERR(newmap)) {
2156 err = PTR_ERR(newmap);
2157 goto bad; 3197 goto bad;
2158 } 3198 handled_incremental = true;
2159 BUG_ON(!newmap);
2160 if (newmap != osdc->osdmap) {
2161 ceph_osdmap_destroy(osdc->osdmap);
2162 osdc->osdmap = newmap;
2163 }
2164 was_full = was_full ||
2165 ceph_osdmap_flag(osdc->osdmap,
2166 CEPH_OSDMAP_FULL);
2167 kick_requests(osdc, 0, was_full);
2168 } else { 3199 } else {
2169 dout("ignoring incremental map %u len %d\n", 3200 dout("ignoring incremental map %u len %d\n",
2170 epoch, maplen); 3201 epoch, maplen);
2171 } 3202 }
2172 p = next; 3203 p += maplen;
2173 nr_maps--; 3204 nr_maps--;
2174 } 3205 }
2175 if (newmap) 3206 if (handled_incremental)
2176 goto done; 3207 goto done;
2177 3208
2178 /* full maps */ 3209 /* full maps */
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2186 if (nr_maps > 1) { 3217 if (nr_maps > 1) {
2187 dout("skipping non-latest full map %u len %d\n", 3218 dout("skipping non-latest full map %u len %d\n",
2188 epoch, maplen); 3219 epoch, maplen);
2189 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { 3220 } else if (osdc->osdmap->epoch >= epoch) {
2190 dout("skipping full map %u len %d, " 3221 dout("skipping full map %u len %d, "
2191 "older than our %u\n", epoch, maplen, 3222 "older than our %u\n", epoch, maplen,
2192 osdc->osdmap->epoch); 3223 osdc->osdmap->epoch);
2193 } else { 3224 } else {
2194 int skipped_map = 0;
2195
2196 dout("taking full map %u len %d\n", epoch, maplen); 3225 dout("taking full map %u len %d\n", epoch, maplen);
2197 newmap = ceph_osdmap_decode(&p, p+maplen); 3226 err = handle_one_map(osdc, p, p + maplen, false,
2198 if (IS_ERR(newmap)) { 3227 &need_resend, &need_resend_linger);
2199 err = PTR_ERR(newmap); 3228 if (err)
2200 goto bad; 3229 goto bad;
2201 }
2202 BUG_ON(!newmap);
2203 oldmap = osdc->osdmap;
2204 osdc->osdmap = newmap;
2205 if (oldmap) {
2206 if (oldmap->epoch + 1 < newmap->epoch)
2207 skipped_map = 1;
2208 ceph_osdmap_destroy(oldmap);
2209 }
2210 was_full = was_full ||
2211 ceph_osdmap_flag(osdc->osdmap,
2212 CEPH_OSDMAP_FULL);
2213 kick_requests(osdc, skipped_map, was_full);
2214 } 3230 }
2215 p += maplen; 3231 p += maplen;
2216 nr_maps--; 3232 nr_maps--;
2217 } 3233 }
2218 3234
2219 if (!osdc->osdmap)
2220 goto bad;
2221done: 3235done:
2222 downgrade_write(&osdc->map_sem);
2223 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
2224 osdc->osdmap->epoch);
2225
2226 /* 3236 /*
2227 * subscribe to subsequent osdmap updates if full to ensure 3237 * subscribe to subsequent osdmap updates if full to ensure
2228 * we find out when we are no longer full and stop returning 3238 * we find out when we are no longer full and stop returning
2229 * ENOSPC. 3239 * ENOSPC.
2230 */ 3240 */
2231 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || 3241 pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2232 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || 3242 pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2233 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) 3243 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
2234 ceph_monc_request_next_osdmap(&osdc->client->monc); 3244 have_pool_full(osdc);
2235 3245 if (was_pauserd || was_pausewr || pauserd || pausewr)
2236 mutex_lock(&osdc->request_mutex); 3246 maybe_request_map(osdc);
2237 __send_queued(osdc); 3247
2238 mutex_unlock(&osdc->request_mutex); 3248 kick_requests(osdc, &need_resend, &need_resend_linger);
2239 up_read(&osdc->map_sem); 3249
3250 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
3251 osdc->osdmap->epoch);
3252 up_write(&osdc->lock);
2240 wake_up_all(&osdc->client->auth_wq); 3253 wake_up_all(&osdc->client->auth_wq);
2241 return; 3254 return;
2242 3255
2243bad: 3256bad:
2244 pr_err("osdc handle_map corrupt msg\n"); 3257 pr_err("osdc handle_map corrupt msg\n");
2245 ceph_msg_dump(msg); 3258 ceph_msg_dump(msg);
2246 up_write(&osdc->map_sem); 3259 up_write(&osdc->lock);
2247} 3260}
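
ceph_osdc_handle_map() compares pause state before and after applying the maps: reads pause on PAUSERD alone, writes pause on PAUSEWR, a FULL cluster, or any full pool, and a transition in either direction triggers maybe_request_map(). Condensed into two hedged helpers:

#include <stdbool.h>

/* Per the flag computations above: reads and writes pause on
 * different conditions. */
static bool reads_paused(bool pauserd)
{
	return pauserd;
}

static bool writes_paused(bool pausewr, bool map_full, bool any_pool_full)
{
	return pausewr || map_full || any_pool_full;
}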
2248 3261
2249/* 3262/*
2250 * watch/notify callback event infrastructure 3263 * Resubmit requests pending on the given osd.
2251 *
2252 * These callbacks are used both for watch and notify operations.
2253 */ 3264 */
2254static void __release_event(struct kref *kref) 3265static void kick_osd_requests(struct ceph_osd *osd)
2255{ 3266{
2256 struct ceph_osd_event *event = 3267 struct rb_node *n;
2257 container_of(kref, struct ceph_osd_event, kref);
2258 3268
2259 dout("__release_event %p\n", event); 3269 for (n = rb_first(&osd->o_requests); n; ) {
2260 kfree(event); 3270 struct ceph_osd_request *req =
2261} 3271 rb_entry(n, struct ceph_osd_request, r_node);
2262 3272
2263static void get_event(struct ceph_osd_event *event) 3273 n = rb_next(n); /* cancel_linger_request() */
2264{
2265 kref_get(&event->kref);
2266}
2267 3274
2268void ceph_osdc_put_event(struct ceph_osd_event *event) 3275 if (!req->r_linger) {
2269{ 3276 if (!req->r_t.paused)
2270 kref_put(&event->kref, __release_event); 3277 send_request(req);
3278 } else {
3279 cancel_linger_request(req);
3280 }
3281 }
3282 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
3283 struct ceph_osd_linger_request *lreq =
3284 rb_entry(n, struct ceph_osd_linger_request, node);
3285
3286 send_linger(lreq);
3287 }
2271} 3288}
2272EXPORT_SYMBOL(ceph_osdc_put_event);
2273 3289
2274static void __insert_event(struct ceph_osd_client *osdc, 3290/*
2275 struct ceph_osd_event *new) 3291 * If the osd connection drops, we need to resubmit all requests.
3292 */
3293static void osd_fault(struct ceph_connection *con)
2276{ 3294{
2277 struct rb_node **p = &osdc->event_tree.rb_node; 3295 struct ceph_osd *osd = con->private;
2278 struct rb_node *parent = NULL; 3296 struct ceph_osd_client *osdc = osd->o_osdc;
2279 struct ceph_osd_event *event = NULL;
2280 3297
2281 while (*p) { 3298 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
2282 parent = *p; 3299
2283 event = rb_entry(parent, struct ceph_osd_event, node); 3300 down_write(&osdc->lock);
2284 if (new->cookie < event->cookie) 3301 if (!osd_registered(osd)) {
2285 p = &(*p)->rb_left; 3302 dout("%s osd%d unknown\n", __func__, osd->o_osd);
2286 else if (new->cookie > event->cookie) 3303 goto out_unlock;
2287 p = &(*p)->rb_right;
2288 else
2289 BUG();
2290 } 3304 }
2291 3305
2292 rb_link_node(&new->node, parent, p); 3306 if (!reopen_osd(osd))
2293 rb_insert_color(&new->node, &osdc->event_tree); 3307 kick_osd_requests(osd);
3308 maybe_request_map(osdc);
3309
3310out_unlock:
3311 up_write(&osdc->lock);
2294} 3312}
2295 3313
2296static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, 3314/*
2297 u64 cookie) 3315 * Process osd watch notifications
3316 */
3317static void handle_watch_notify(struct ceph_osd_client *osdc,
3318 struct ceph_msg *msg)
2298{ 3319{
2299 struct rb_node **p = &osdc->event_tree.rb_node; 3320 void *p = msg->front.iov_base;
2300 struct rb_node *parent = NULL; 3321 void *const end = p + msg->front.iov_len;
2301 struct ceph_osd_event *event = NULL; 3322 struct ceph_osd_linger_request *lreq;
3323 struct linger_work *lwork;
3324 u8 proto_ver, opcode;
3325 u64 cookie, notify_id;
3326 u64 notifier_id = 0;
3327 s32 return_code = 0;
3328 void *payload = NULL;
3329 u32 payload_len = 0;
2302 3330
2303 while (*p) { 3331 ceph_decode_8_safe(&p, end, proto_ver, bad);
2304 parent = *p; 3332 ceph_decode_8_safe(&p, end, opcode, bad);
2305 event = rb_entry(parent, struct ceph_osd_event, node); 3333 ceph_decode_64_safe(&p, end, cookie, bad);
2306 if (cookie < event->cookie) 3334 p += 8; /* skip ver */
2307 p = &(*p)->rb_left; 3335 ceph_decode_64_safe(&p, end, notify_id, bad);
2308 else if (cookie > event->cookie) 3336
2309 p = &(*p)->rb_right; 3337 if (proto_ver >= 1) {
2310 else 3338 ceph_decode_32_safe(&p, end, payload_len, bad);
2311 return event; 3339 ceph_decode_need(&p, end, payload_len, bad);
3340 payload = p;
3341 p += payload_len;
2312 } 3342 }
2313 return NULL;
2314}
2315 3343
2316static void __remove_event(struct ceph_osd_event *event) 3344 if (le16_to_cpu(msg->hdr.version) >= 2)
2317{ 3345 ceph_decode_32_safe(&p, end, return_code, bad);
2318 struct ceph_osd_client *osdc = event->osdc;
2319 3346
2320 if (!RB_EMPTY_NODE(&event->node)) { 3347 if (le16_to_cpu(msg->hdr.version) >= 3)
2321 dout("__remove_event removed %p\n", event); 3348 ceph_decode_64_safe(&p, end, notifier_id, bad);
2322 rb_erase(&event->node, &osdc->event_tree); 3349
2323 ceph_osdc_put_event(event); 3350 down_read(&osdc->lock);
3351 lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
3352 if (!lreq) {
3353 dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
3354 cookie);
3355 goto out_unlock_osdc;
3356 }
3357
3358 mutex_lock(&lreq->lock);
3359 dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
3360 opcode, cookie, lreq, lreq->is_watch);
3361 if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
3362 if (!lreq->last_error) {
3363 lreq->last_error = -ENOTCONN;
3364 queue_watch_error(lreq);
3365 }
3366 } else if (!lreq->is_watch) {
3367 /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
3368 if (lreq->notify_id && lreq->notify_id != notify_id) {
3369 dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
3370 lreq->notify_id, notify_id);
3371 } else if (!completion_done(&lreq->notify_finish_wait)) {
3372 struct ceph_msg_data *data =
3373 list_first_entry_or_null(&msg->data,
3374 struct ceph_msg_data,
3375 links);
3376
3377 if (data) {
3378 if (lreq->preply_pages) {
3379 WARN_ON(data->type !=
3380 CEPH_MSG_DATA_PAGES);
3381 *lreq->preply_pages = data->pages;
3382 *lreq->preply_len = data->length;
3383 } else {
3384 ceph_release_page_vector(data->pages,
3385 calc_pages_for(0, data->length));
3386 }
3387 }
3388 lreq->notify_finish_error = return_code;
3389 complete_all(&lreq->notify_finish_wait);
3390 }
2324 } else { 3391 } else {
2325 dout("__remove_event didn't remove %p\n", event); 3392 /* CEPH_WATCH_EVENT_NOTIFY */
3393 lwork = lwork_alloc(lreq, do_watch_notify);
3394 if (!lwork) {
3395 pr_err("failed to allocate notify-lwork\n");
3396 goto out_unlock_lreq;
3397 }
3398
3399 lwork->notify.notify_id = notify_id;
3400 lwork->notify.notifier_id = notifier_id;
3401 lwork->notify.payload = payload;
3402 lwork->notify.payload_len = payload_len;
3403 lwork->notify.msg = ceph_msg_get(msg);
3404 lwork_queue(lwork);
2326 } 3405 }
3406
3407out_unlock_lreq:
3408 mutex_unlock(&lreq->lock);
3409out_unlock_osdc:
3410 up_read(&osdc->lock);
3411 return;
3412
3413bad:
3414 pr_err("osdc handle_watch_notify corrupt msg\n");
2327} 3415}
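
handle_watch_notify() distinguishes three message kinds: a DISCONNECT error against a watch, the completion of a notify this client initiated (is_watch is false), and an incoming notify event that gets queued as lwork. As an illustrative classification (hypothetical names):

#include <stdbool.h>

enum watch_msg { MSG_DISCONNECT, MSG_NOTIFY_COMPLETE, MSG_NOTIFY_EVENT };

static enum watch_msg classify_watch_msg(bool is_disconnect, bool is_watch)
{
	if (is_disconnect)
		return MSG_DISCONNECT;		/* record -ENOTCONN for caller */
	if (!is_watch)
		return MSG_NOTIFY_COMPLETE;	/* our own notify finished */
	return MSG_NOTIFY_EVENT;		/* queue work for the watch cb */
}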
2328 3416
2329int ceph_osdc_create_event(struct ceph_osd_client *osdc, 3417/*
2330 void (*event_cb)(u64, u64, u8, void *), 3418 * Register request, send initial attempt.
2331 void *data, struct ceph_osd_event **pevent) 3419 */
3420int ceph_osdc_start_request(struct ceph_osd_client *osdc,
3421 struct ceph_osd_request *req,
3422 bool nofail)
2332{ 3423{
2333 struct ceph_osd_event *event; 3424 down_read(&osdc->lock);
2334 3425 submit_request(req, false);
2335 event = kmalloc(sizeof(*event), GFP_NOIO); 3426 up_read(&osdc->lock);
2336 if (!event)
2337 return -ENOMEM;
2338 3427
2339 dout("create_event %p\n", event);
2340 event->cb = event_cb;
2341 event->one_shot = 0;
2342 event->data = data;
2343 event->osdc = osdc;
2344 INIT_LIST_HEAD(&event->osd_node);
2345 RB_CLEAR_NODE(&event->node);
2346 kref_init(&event->kref); /* one ref for us */
2347 kref_get(&event->kref); /* one ref for the caller */
2348
2349 spin_lock(&osdc->event_lock);
2350 event->cookie = ++osdc->event_count;
2351 __insert_event(osdc, event);
2352 spin_unlock(&osdc->event_lock);
2353
2354 *pevent = event;
2355 return 0; 3428 return 0;
2356} 3429}
2357EXPORT_SYMBOL(ceph_osdc_create_event); 3430EXPORT_SYMBOL(ceph_osdc_start_request);
2358 3431
2359void ceph_osdc_cancel_event(struct ceph_osd_event *event) 3432/*
3433 * Unregister a registered request. The request is not completed (i.e.
3434 * no callbacks or wakeups) - higher layers are supposed to know what
3435 * they are canceling.
3436 */
3437void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2360{ 3438{
2361 struct ceph_osd_client *osdc = event->osdc; 3439 struct ceph_osd_client *osdc = req->r_osdc;
2362 3440
2363 dout("cancel_event %p\n", event); 3441 down_write(&osdc->lock);
2364 spin_lock(&osdc->event_lock); 3442 if (req->r_osd)
2365 __remove_event(event); 3443 cancel_request(req);
2366 spin_unlock(&osdc->event_lock); 3444 up_write(&osdc->lock);
2367 ceph_osdc_put_event(event); /* caller's */
2368} 3445}
2369EXPORT_SYMBOL(ceph_osdc_cancel_event); 3446EXPORT_SYMBOL(ceph_osdc_cancel_request);
2370
2371 3447
2372static void do_event_work(struct work_struct *work) 3448/*
3449 * @timeout: in jiffies, 0 means "wait forever"
3450 */
3451static int wait_request_timeout(struct ceph_osd_request *req,
3452 unsigned long timeout)
2373{ 3453{
2374 struct ceph_osd_event_work *event_work = 3454 long left;
2375 container_of(work, struct ceph_osd_event_work, work);
2376 struct ceph_osd_event *event = event_work->event;
2377 u64 ver = event_work->ver;
2378 u64 notify_id = event_work->notify_id;
2379 u8 opcode = event_work->opcode;
2380 3455
2381 dout("do_event_work completing %p\n", event); 3456 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
2382 event->cb(ver, notify_id, opcode, event->data); 3457 left = wait_for_completion_killable_timeout(&req->r_completion,
2383 dout("do_event_work completed %p\n", event); 3458 ceph_timeout_jiffies(timeout));
2384 ceph_osdc_put_event(event); 3459 if (left <= 0) {
2385 kfree(event_work); 3460 left = left ?: -ETIMEDOUT;
3461 ceph_osdc_cancel_request(req);
3462
 3463 /* kludge - need to wake ceph_osdc_sync() */
3464 complete_all(&req->r_safe_completion);
3465 } else {
3466 left = req->r_result; /* completed */
3467 }
3468
3469 return left;
2386} 3470}
2387 3471
3472/*
3473 * wait for a request to complete
3474 */
3475int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
3476 struct ceph_osd_request *req)
3477{
3478 return wait_request_timeout(req, 0);
3479}
3480EXPORT_SYMBOL(ceph_osdc_wait_request);
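
The usual caller pattern for the pair above is submit-then-wait. A sketch of a hypothetical helper built on the real entry points (request construction and error paths abbreviated):

/* Assumes req is a fully built ceph_osd_request; drops the caller's
 * ref when done. */
static int submit_and_wait(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	int ret;

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);  /* r_result or error */

	ceph_osdc_put_request(req);
	return ret;
}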
2388 3481
2389/* 3482/*
2390 * Process osd watch notifications 3483 * sync - wait for all in-flight requests to flush. avoid starvation.
2391 */ 3484 */
2392static void handle_watch_notify(struct ceph_osd_client *osdc, 3485void ceph_osdc_sync(struct ceph_osd_client *osdc)
2393 struct ceph_msg *msg)
2394{ 3486{
2395 void *p, *end; 3487 struct rb_node *n, *p;
2396 u8 proto_ver; 3488 u64 last_tid = atomic64_read(&osdc->last_tid);
2397 u64 cookie, ver, notify_id;
2398 u8 opcode;
2399 struct ceph_osd_event *event;
2400 struct ceph_osd_event_work *event_work;
2401 3489
2402 p = msg->front.iov_base; 3490again:
2403 end = p + msg->front.iov_len; 3491 down_read(&osdc->lock);
3492 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
3493 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2404 3494
2405 ceph_decode_8_safe(&p, end, proto_ver, bad); 3495 mutex_lock(&osd->lock);
2406 ceph_decode_8_safe(&p, end, opcode, bad); 3496 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2407 ceph_decode_64_safe(&p, end, cookie, bad); 3497 struct ceph_osd_request *req =
2408 ceph_decode_64_safe(&p, end, ver, bad); 3498 rb_entry(p, struct ceph_osd_request, r_node);
2409 ceph_decode_64_safe(&p, end, notify_id, bad); 3499
3500 if (req->r_tid > last_tid)
3501 break;
3502
3503 if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
3504 continue;
2410 3505
2411 spin_lock(&osdc->event_lock); 3506 ceph_osdc_get_request(req);
2412 event = __find_event(osdc, cookie); 3507 mutex_unlock(&osd->lock);
2413 if (event) { 3508 up_read(&osdc->lock);
2414 BUG_ON(event->one_shot); 3509 dout("%s waiting on req %p tid %llu last_tid %llu\n",
2415 get_event(event); 3510 __func__, req, req->r_tid, last_tid);
2416 } 3511 wait_for_completion(&req->r_safe_completion);
2417 spin_unlock(&osdc->event_lock); 3512 ceph_osdc_put_request(req);
2418 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 3513 goto again;
2419 cookie, ver, event);
2420 if (event) {
2421 event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
2422 if (!event_work) {
2423 pr_err("couldn't allocate event_work\n");
2424 ceph_osdc_put_event(event);
2425 return;
2426 } 3514 }
2427 INIT_WORK(&event_work->work, do_event_work);
2428 event_work->event = event;
2429 event_work->ver = ver;
2430 event_work->notify_id = notify_id;
2431 event_work->opcode = opcode;
2432 3515
2433 queue_work(osdc->notify_wq, &event_work->work); 3516 mutex_unlock(&osd->lock);
2434 } 3517 }
2435 3518
2436 return; 3519 up_read(&osdc->lock);
3520 dout("%s done last_tid %llu\n", __func__, last_tid);
3521}
3522EXPORT_SYMBOL(ceph_osdc_sync);
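A minimal usage sketch (the flush point is hypothetical). Only writes with tids at or below last_tid are waited on via r_safe_completion; requests submitted after the call begins are ignored, which is what avoids starvation:

static void flush_all_writes(struct ceph_client *client)
{
	/* returns once every write in flight at this instant is safe */
	ceph_osdc_sync(&client->osdc);
}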
2437 3523
2438bad: 3524static struct ceph_osd_request *
2439 pr_err("osdc handle_watch_notify corrupt msg\n"); 3525alloc_linger_request(struct ceph_osd_linger_request *lreq)
3526{
3527 struct ceph_osd_request *req;
3528
3529 req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
3530 if (!req)
3531 return NULL;
3532
3533 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3534 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3535
3536 if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
3537 ceph_osdc_put_request(req);
3538 return NULL;
3539 }
3540
3541 return req;
2440} 3542}
2441 3543
2442/* 3544/*
2443 * build new request AND message 3545 * Returns a handle, caller owns a ref.
2444 *
2445 */ 3546 */
2446void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, 3547struct ceph_osd_linger_request *
2447 struct ceph_snap_context *snapc, u64 snap_id, 3548ceph_osdc_watch(struct ceph_osd_client *osdc,
2448 struct timespec *mtime) 3549 struct ceph_object_id *oid,
2449{ 3550 struct ceph_object_locator *oloc,
2450 struct ceph_msg *msg = req->r_request; 3551 rados_watchcb2_t wcb,
2451 void *p; 3552 rados_watcherrcb_t errcb,
2452 size_t msg_size; 3553 void *data)
2453 int flags = req->r_flags; 3554{
2454 u64 data_len; 3555 struct ceph_osd_linger_request *lreq;
2455 unsigned int i; 3556 int ret;
2456
2457 req->r_snapid = snap_id;
2458 req->r_snapc = ceph_get_snap_context(snapc);
2459
2460 /* encode request */
2461 msg->hdr.version = cpu_to_le16(4);
2462
2463 p = msg->front.iov_base;
2464 ceph_encode_32(&p, 1); /* client_inc is always 1 */
2465 req->r_request_osdmap_epoch = p;
2466 p += 4;
2467 req->r_request_flags = p;
2468 p += 4;
2469 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
2470 ceph_encode_timespec(p, mtime);
2471 p += sizeof(struct ceph_timespec);
2472 req->r_request_reassert_version = p;
2473 p += sizeof(struct ceph_eversion); /* will get filled in */
2474
2475 /* oloc */
2476 ceph_encode_8(&p, 4);
2477 ceph_encode_8(&p, 4);
2478 ceph_encode_32(&p, 8 + 4 + 4);
2479 req->r_request_pool = p;
2480 p += 8;
2481 ceph_encode_32(&p, -1); /* preferred */
2482 ceph_encode_32(&p, 0); /* key len */
2483 3557
2484 ceph_encode_8(&p, 1); 3558 lreq = linger_alloc(osdc);
2485 req->r_request_pgid = p; 3559 if (!lreq)
2486 p += 8 + 4; 3560 return ERR_PTR(-ENOMEM);
2487 ceph_encode_32(&p, -1); /* preferred */
2488 3561
2489 /* oid */ 3562 lreq->is_watch = true;
2490 ceph_encode_32(&p, req->r_base_oid.name_len); 3563 lreq->wcb = wcb;
2491 memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); 3564 lreq->errcb = errcb;
2492 dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, 3565 lreq->data = data;
2493 req->r_base_oid.name, req->r_base_oid.name_len); 3566 lreq->watch_valid_thru = jiffies;
2494 p += req->r_base_oid.name_len; 3567
2495 3568 ceph_oid_copy(&lreq->t.base_oid, oid);
2496 /* ops--can imply data */ 3569 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2497 ceph_encode_16(&p, (u16)req->r_num_ops); 3570 lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
2498 data_len = 0; 3571 lreq->mtime = CURRENT_TIME;
2499 for (i = 0; i < req->r_num_ops; i++) { 3572
2500 data_len += osd_req_encode_op(req, p, i); 3573 lreq->reg_req = alloc_linger_request(lreq);
2501 p += sizeof(struct ceph_osd_op); 3574 if (!lreq->reg_req) {
3575 ret = -ENOMEM;
3576 goto err_put_lreq;
2502 } 3577 }
2503 3578
2504 /* snaps */ 3579 lreq->ping_req = alloc_linger_request(lreq);
2505 ceph_encode_64(&p, req->r_snapid); 3580 if (!lreq->ping_req) {
2506 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); 3581 ret = -ENOMEM;
2507 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); 3582 goto err_put_lreq;
2508 if (req->r_snapc) {
2509 for (i = 0; i < snapc->num_snaps; i++) {
2510 ceph_encode_64(&p, req->r_snapc->snaps[i]);
2511 }
2512 } 3583 }
2513 3584
2514 req->r_request_attempts = p; 3585 down_write(&osdc->lock);
2515 p += 4; 3586 linger_register(lreq); /* before osd_req_op_* */
2516 3587 osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
2517 /* data */ 3588 CEPH_OSD_WATCH_OP_WATCH);
2518 if (flags & CEPH_OSD_FLAG_WRITE) { 3589 osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
2519 u16 data_off; 3590 CEPH_OSD_WATCH_OP_PING);
2520 3591 linger_submit(lreq);
2521 /* 3592 up_write(&osdc->lock);
2522 * The header "data_off" is a hint to the receiver 3593
2523 * allowing it to align received data into its 3594 ret = linger_reg_commit_wait(lreq);
2524 * buffers such that there's no need to re-copy 3595 if (ret) {
2525 * it before writing it to disk (direct I/O). 3596 linger_cancel(lreq);
2526 */ 3597 goto err_put_lreq;
2527 data_off = (u16) (off & 0xffff);
2528 req->r_request->hdr.data_off = cpu_to_le16(data_off);
2529 } 3598 }
2530 req->r_request->hdr.data_len = cpu_to_le32(data_len);
2531 3599
2532 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 3600 return lreq;
2533 msg_size = p - msg->front.iov_base;
2534 msg->front.iov_len = msg_size;
2535 msg->hdr.front_len = cpu_to_le32(msg_size);
2536 3601
2537 dout("build_request msg_size was %d\n", (int)msg_size); 3602err_put_lreq:
3603 linger_put(lreq);
3604 return ERR_PTR(ret);
2538} 3605}
2539EXPORT_SYMBOL(ceph_osdc_build_request); 3606EXPORT_SYMBOL(ceph_osdc_watch);
2540 3607
2541/* 3608/*
2542 * Register request, send initial attempt. 3609 * Releases a ref.
3610 *
3611 * Times out after mount_timeout to preserve rbd unmap behaviour
3612 * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
3613 * with mount_timeout").
2543 */ 3614 */
2544int ceph_osdc_start_request(struct ceph_osd_client *osdc, 3615int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
2545 struct ceph_osd_request *req, 3616 struct ceph_osd_linger_request *lreq)
2546 bool nofail)
2547{ 3617{
2548 int rc; 3618 struct ceph_options *opts = osdc->client->options;
3619 struct ceph_osd_request *req;
3620 int ret;
2549 3621
2550 down_read(&osdc->map_sem); 3622 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
2551 mutex_lock(&osdc->request_mutex); 3623 if (!req)
3624 return -ENOMEM;
3625
3626 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3627 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3628 req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
3629 req->r_mtime = CURRENT_TIME;
3630 osd_req_op_watch_init(req, 0, lreq->linger_id,
3631 CEPH_OSD_WATCH_OP_UNWATCH);
2552 3632
2553 rc = __ceph_osdc_start_request(osdc, req, nofail); 3633 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
3634 if (ret)
3635 goto out_put_req;
2554 3636
2555 mutex_unlock(&osdc->request_mutex); 3637 ceph_osdc_start_request(osdc, req, false);
2556 up_read(&osdc->map_sem); 3638 linger_cancel(lreq);
3639 linger_put(lreq);
3640 ret = wait_request_timeout(req, opts->mount_timeout);
2557 3641
2558 return rc; 3642out_put_req:
3643 ceph_osdc_put_request(req);
3644 return ret;
2559} 3645}
2560EXPORT_SYMBOL(ceph_osdc_start_request); 3646EXPORT_SYMBOL(ceph_osdc_unwatch);
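Together, ceph_osdc_watch() and ceph_osdc_unwatch() replace the old create_event/cancel_event pair. A hypothetical lifecycle, loosely modelled on rbd's header watch; callback names and bodies are illustrative:

static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
			u64 notifier_id, void *data, size_t data_len)
{
	/* delivered from osdc->notify_wq; usually answered with
	 * ceph_osdc_notify_ack() */
}

static void my_watch_errcb(void *arg, u64 cookie, int err)
{
	/* watch is no longer valid; re-watch or tear down */
}

static int start_watch(struct ceph_osd_client *osdc,
		       struct ceph_object_id *oid,
		       struct ceph_object_locator *oloc,
		       struct ceph_osd_linger_request **phandle)
{
	struct ceph_osd_linger_request *handle;

	handle = ceph_osdc_watch(osdc, oid, oloc, my_watch_cb,
				 my_watch_errcb, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	*phandle = handle;	/* owns a ref until ceph_osdc_unwatch() */
	return 0;
}

The matching teardown is a single ceph_osdc_unwatch(osdc, *phandle), which drops that ref and, per the comment above, bounds the wait by mount_timeout.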
2561 3647
2562/* 3648static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
2563 * Unregister a registered request. The request is not completed (i.e. 3649 u64 notify_id, u64 cookie, void *payload,
2564 * no callbacks or wakeups) - higher layers are supposed to know what 3650 size_t payload_len)
2565 * they are canceling.
2566 */
2567void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2568{ 3651{
2569 struct ceph_osd_client *osdc = req->r_osdc; 3652 struct ceph_osd_req_op *op;
3653 struct ceph_pagelist *pl;
3654 int ret;
2570 3655
2571 mutex_lock(&osdc->request_mutex); 3656 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
2572 if (req->r_linger) 3657
2573 __unregister_linger_request(osdc, req); 3658 pl = kmalloc(sizeof(*pl), GFP_NOIO);
2574 __unregister_request(osdc, req); 3659 if (!pl)
2575 mutex_unlock(&osdc->request_mutex); 3660 return -ENOMEM;
3661
3662 ceph_pagelist_init(pl);
3663 ret = ceph_pagelist_encode_64(pl, notify_id);
3664 ret |= ceph_pagelist_encode_64(pl, cookie);
3665 if (payload) {
3666 ret |= ceph_pagelist_encode_32(pl, payload_len);
3667 ret |= ceph_pagelist_append(pl, payload, payload_len);
3668 } else {
3669 ret |= ceph_pagelist_encode_32(pl, 0);
3670 }
3671 if (ret) {
3672 ceph_pagelist_release(pl);
3673 return -ENOMEM;
3674 }
2576 3675
2577 dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); 3676 ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
3677 op->indata_len = pl->length;
3678 return 0;
2578} 3679}
2579EXPORT_SYMBOL(ceph_osdc_cancel_request);
2580 3680
2581/* 3681int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
2582 * wait for a request to complete 3682 struct ceph_object_id *oid,
2583 */ 3683 struct ceph_object_locator *oloc,
2584int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 3684 u64 notify_id,
2585 struct ceph_osd_request *req) 3685 u64 cookie,
3686 void *payload,
3687 size_t payload_len)
2586{ 3688{
2587 int rc; 3689 struct ceph_osd_request *req;
3690 int ret;
2588 3691
2589 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 3692 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
3693 if (!req)
3694 return -ENOMEM;
2590 3695
2591 rc = wait_for_completion_interruptible(&req->r_completion); 3696 ceph_oid_copy(&req->r_base_oid, oid);
2592 if (rc < 0) { 3697 ceph_oloc_copy(&req->r_base_oloc, oloc);
2593 dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); 3698 req->r_flags = CEPH_OSD_FLAG_READ;
2594 ceph_osdc_cancel_request(req); 3699
2595 complete_request(req); 3700 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2596 return rc; 3701 if (ret)
3702 goto out_put_req;
3703
3704 ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
3705 payload_len);
3706 if (ret)
3707 goto out_put_req;
3708
3709 ceph_osdc_start_request(osdc, req, false);
3710 ret = ceph_osdc_wait_request(osdc, req);
3711
3712out_put_req:
3713 ceph_osdc_put_request(req);
3714 return ret;
3715}
3716EXPORT_SYMBOL(ceph_osdc_notify_ack);
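A watcher typically acks from its notify callback; a sketch with an empty payload (names are illustrative):

static void ack_notify(struct ceph_osd_client *osdc,
		       struct ceph_object_id *oid,
		       struct ceph_object_locator *oloc,
		       u64 notify_id, u64 cookie)
{
	int ret;

	/* empty payload - the notifier only waits for the ack */
	ret = ceph_osdc_notify_ack(osdc, oid, oloc, notify_id, cookie,
				   NULL, 0);
	if (ret)
		pr_warn("notify_ack failed: %d\n", ret);
}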
3717
3718static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
3719 u64 cookie, u32 prot_ver, u32 timeout,
3720 void *payload, size_t payload_len)
3721{
3722 struct ceph_osd_req_op *op;
3723 struct ceph_pagelist *pl;
3724 int ret;
3725
3726 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
3727 op->notify.cookie = cookie;
3728
3729 pl = kmalloc(sizeof(*pl), GFP_NOIO);
3730 if (!pl)
3731 return -ENOMEM;
3732
3733 ceph_pagelist_init(pl);
3734 ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
3735 ret |= ceph_pagelist_encode_32(pl, timeout);
3736 ret |= ceph_pagelist_encode_32(pl, payload_len);
3737 ret |= ceph_pagelist_append(pl, payload, payload_len);
3738 if (ret) {
3739 ceph_pagelist_release(pl);
3740 return -ENOMEM;
2597 } 3741 }
2598 3742
2599 dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, 3743 ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
2600 req->r_result); 3744 op->indata_len = pl->length;
2601 return req->r_result; 3745 return 0;
2602} 3746}
2603EXPORT_SYMBOL(ceph_osdc_wait_request);
2604 3747
2605/* 3748/*
2606 * sync - wait for all in-flight requests to flush. avoid starvation. 3749 * @timeout: in seconds
3750 *
3751 * @preply_{pages,len} are initialized both on success and error.
3752 * The caller is responsible for:
3753 *
3754 * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
2607 */ 3755 */
2608void ceph_osdc_sync(struct ceph_osd_client *osdc) 3756int ceph_osdc_notify(struct ceph_osd_client *osdc,
3757 struct ceph_object_id *oid,
3758 struct ceph_object_locator *oloc,
3759 void *payload,
3760 size_t payload_len,
3761 u32 timeout,
3762 struct page ***preply_pages,
3763 size_t *preply_len)
2609{ 3764{
2610 struct ceph_osd_request *req; 3765 struct ceph_osd_linger_request *lreq;
2611 u64 last_tid, next_tid = 0; 3766 struct page **pages;
3767 int ret;
2612 3768
2613 mutex_lock(&osdc->request_mutex); 3769 WARN_ON(!timeout);
2614 last_tid = osdc->last_tid; 3770 if (preply_pages) {
2615 while (1) { 3771 *preply_pages = NULL;
2616 req = __lookup_request_ge(osdc, next_tid); 3772 *preply_len = 0;
2617 if (!req) 3773 }
2618 break;
2619 if (req->r_tid > last_tid)
2620 break;
2621 3774
2622 next_tid = req->r_tid + 1; 3775 lreq = linger_alloc(osdc);
2623 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) 3776 if (!lreq)
2624 continue; 3777 return -ENOMEM;
2625 3778
2626 ceph_osdc_get_request(req); 3779 lreq->preply_pages = preply_pages;
2627 mutex_unlock(&osdc->request_mutex); 3780 lreq->preply_len = preply_len;
2628 dout("sync waiting on tid %llu (last is %llu)\n", 3781
2629 req->r_tid, last_tid); 3782 ceph_oid_copy(&lreq->t.base_oid, oid);
2630 wait_for_completion(&req->r_safe_completion); 3783 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2631 mutex_lock(&osdc->request_mutex); 3784 lreq->t.flags = CEPH_OSD_FLAG_READ;
2632 ceph_osdc_put_request(req); 3785
3786 lreq->reg_req = alloc_linger_request(lreq);
3787 if (!lreq->reg_req) {
3788 ret = -ENOMEM;
3789 goto out_put_lreq;
2633 } 3790 }
2634 mutex_unlock(&osdc->request_mutex); 3791
2635 dout("sync done (thru tid %llu)\n", last_tid); 3792 /* for notify_id */
3793 pages = ceph_alloc_page_vector(1, GFP_NOIO);
3794 if (IS_ERR(pages)) {
3795 ret = PTR_ERR(pages);
3796 goto out_put_lreq;
3797 }
3798
3799 down_write(&osdc->lock);
3800 linger_register(lreq); /* before osd_req_op_* */
3801 ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
3802 timeout, payload, payload_len);
3803 if (ret) {
3804 linger_unregister(lreq);
3805 up_write(&osdc->lock);
3806 ceph_release_page_vector(pages, 1);
3807 goto out_put_lreq;
3808 }
3809 ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
3810 response_data),
3811 pages, PAGE_SIZE, 0, false, true);
3812 linger_submit(lreq);
3813 up_write(&osdc->lock);
3814
3815 ret = linger_reg_commit_wait(lreq);
3816 if (!ret)
3817 ret = linger_notify_finish_wait(lreq);
3818 else
3819 dout("lreq %p failed to initiate notify %d\n", lreq, ret);
3820
3821 linger_cancel(lreq);
3822out_put_lreq:
3823 linger_put(lreq);
3824 return ret;
3825}
3826EXPORT_SYMBOL(ceph_osdc_notify);
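A sketch of the reply-page contract spelled out above (the caller and the 5-second timeout are illustrative):

static int send_notify(struct ceph_osd_client *osdc,
		       struct ceph_object_id *oid,
		       struct ceph_object_locator *oloc,
		       void *payload, size_t payload_len)
{
	struct page **reply_pages;
	size_t reply_len;
	int ret;

	ret = ceph_osdc_notify(osdc, oid, oloc, payload, payload_len,
			       5 /* seconds */, &reply_pages, &reply_len);

	/* initialized on success and on error - always release */
	if (reply_pages)
		ceph_release_page_vector(reply_pages,
					 calc_pages_for(0, reply_len));
	return ret;
}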
3827
3828/*
3829 * Return the number of milliseconds since the watch was last
3830 * confirmed, or an error. If there is an error, the watch is no
3831 * longer valid and should be destroyed with ceph_osdc_unwatch().
3832 */
3833int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
3834 struct ceph_osd_linger_request *lreq)
3835{
3836 unsigned long stamp, age;
3837 int ret;
3838
3839 down_read(&osdc->lock);
3840 mutex_lock(&lreq->lock);
3841 stamp = lreq->watch_valid_thru;
3842 if (!list_empty(&lreq->pending_lworks)) {
3843 struct linger_work *lwork =
3844 list_first_entry(&lreq->pending_lworks,
3845 struct linger_work,
3846 pending_item);
3847
3848 if (time_before(lwork->queued_stamp, stamp))
3849 stamp = lwork->queued_stamp;
3850 }
3851 age = jiffies - stamp;
3852 dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
3853 lreq, lreq->linger_id, age, lreq->last_error);
3854 /* we are truncating to msecs, so return a safe upper bound */
3855 ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
3856
3857 mutex_unlock(&lreq->lock);
3858 up_read(&osdc->lock);
3859 return ret;
2636} 3860}
2637EXPORT_SYMBOL(ceph_osdc_sync);
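A hypothetical periodic health check built on ceph_osdc_watch_check() (the 30-second threshold is illustrative):

static bool watch_is_fresh(struct ceph_osd_client *osdc,
			   struct ceph_osd_linger_request *handle)
{
	int ret = ceph_osdc_watch_check(osdc, handle);

	/* negative: watch is dead; positive: ms since last confirmation */
	return ret >= 0 && ret < 30 * MSEC_PER_SEC;
}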
2638 3861
2639/* 3862/*
2640 * Call all pending notify callbacks - for use after a watch is 3863 * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
2646} 3869}
2647EXPORT_SYMBOL(ceph_osdc_flush_notifies); 3870EXPORT_SYMBOL(ceph_osdc_flush_notifies);
2648 3871
3872void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
3873{
3874 down_read(&osdc->lock);
3875 maybe_request_map(osdc);
3876 up_read(&osdc->lock);
3877}
3878EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
2649 3879
2650/* 3880/*
2651 * init, shutdown 3881 * init, shutdown
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2656 3886
2657 dout("init\n"); 3887 dout("init\n");
2658 osdc->client = client; 3888 osdc->client = client;
2659 osdc->osdmap = NULL; 3889 init_rwsem(&osdc->lock);
2660 init_rwsem(&osdc->map_sem);
2661 init_completion(&osdc->map_waiters);
2662 osdc->last_requested_map = 0;
2663 mutex_init(&osdc->request_mutex);
2664 osdc->last_tid = 0;
2665 osdc->osds = RB_ROOT; 3890 osdc->osds = RB_ROOT;
2666 INIT_LIST_HEAD(&osdc->osd_lru); 3891 INIT_LIST_HEAD(&osdc->osd_lru);
2667 osdc->requests = RB_ROOT; 3892 spin_lock_init(&osdc->osd_lru_lock);
2668 INIT_LIST_HEAD(&osdc->req_lru); 3893 osd_init(&osdc->homeless_osd);
2669 INIT_LIST_HEAD(&osdc->req_unsent); 3894 osdc->homeless_osd.o_osdc = osdc;
2670 INIT_LIST_HEAD(&osdc->req_notarget); 3895 osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
2671 INIT_LIST_HEAD(&osdc->req_linger); 3896 osdc->linger_requests = RB_ROOT;
2672 osdc->num_requests = 0; 3897 osdc->map_checks = RB_ROOT;
3898 osdc->linger_map_checks = RB_ROOT;
2673 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); 3899 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
2674 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 3900 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
2675 spin_lock_init(&osdc->event_lock);
2676 osdc->event_tree = RB_ROOT;
2677 osdc->event_count = 0;
2678
2679 schedule_delayed_work(&osdc->osds_timeout_work,
2680 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
2681 3901
2682 err = -ENOMEM; 3902 err = -ENOMEM;
3903 osdc->osdmap = ceph_osdmap_alloc();
3904 if (!osdc->osdmap)
3905 goto out;
3906
2683 osdc->req_mempool = mempool_create_slab_pool(10, 3907 osdc->req_mempool = mempool_create_slab_pool(10,
2684 ceph_osd_request_cache); 3908 ceph_osd_request_cache);
2685 if (!osdc->req_mempool) 3909 if (!osdc->req_mempool)
2686 goto out; 3910 goto out_map;
2687 3911
2688 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, 3912 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
2689 OSD_OP_FRONT_LEN, 10, true, 3913 PAGE_SIZE, 10, true, "osd_op");
2690 "osd_op");
2691 if (err < 0) 3914 if (err < 0)
2692 goto out_mempool; 3915 goto out_mempool;
2693 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, 3916 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
2694 OSD_OPREPLY_FRONT_LEN, 10, true, 3917 PAGE_SIZE, 10, true, "osd_op_reply");
2695 "osd_op_reply");
2696 if (err < 0) 3918 if (err < 0)
2697 goto out_msgpool; 3919 goto out_msgpool;
2698 3920
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2701 if (!osdc->notify_wq) 3923 if (!osdc->notify_wq)
2702 goto out_msgpool_reply; 3924 goto out_msgpool_reply;
2703 3925
3926 schedule_delayed_work(&osdc->timeout_work,
3927 osdc->client->options->osd_keepalive_timeout);
3928 schedule_delayed_work(&osdc->osds_timeout_work,
3929 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
3930
2704 return 0; 3931 return 0;
2705 3932
2706out_msgpool_reply: 3933out_msgpool_reply:
@@ -2709,6 +3936,8 @@ out_msgpool:
2709 ceph_msgpool_destroy(&osdc->msgpool_op); 3936 ceph_msgpool_destroy(&osdc->msgpool_op);
2710out_mempool: 3937out_mempool:
2711 mempool_destroy(osdc->req_mempool); 3938 mempool_destroy(osdc->req_mempool);
3939out_map:
3940 ceph_osdmap_destroy(osdc->osdmap);
2712out: 3941out:
2713 return err; 3942 return err;
2714} 3943}
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
2719 destroy_workqueue(osdc->notify_wq); 3948 destroy_workqueue(osdc->notify_wq);
2720 cancel_delayed_work_sync(&osdc->timeout_work); 3949 cancel_delayed_work_sync(&osdc->timeout_work);
2721 cancel_delayed_work_sync(&osdc->osds_timeout_work); 3950 cancel_delayed_work_sync(&osdc->osds_timeout_work);
2722 if (osdc->osdmap) { 3951
2723 ceph_osdmap_destroy(osdc->osdmap); 3952 down_write(&osdc->lock);
2724 osdc->osdmap = NULL; 3953 while (!RB_EMPTY_ROOT(&osdc->osds)) {
3954 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
3955 struct ceph_osd, o_node);
3956 close_osd(osd);
2725 } 3957 }
2726 remove_all_osds(osdc); 3958 up_write(&osdc->lock);
3959 WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
3960 osd_cleanup(&osdc->homeless_osd);
3961
3962 WARN_ON(!list_empty(&osdc->osd_lru));
3963 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
3964 WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
3965 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
3966 WARN_ON(atomic_read(&osdc->num_requests));
3967 WARN_ON(atomic_read(&osdc->num_homeless));
3968
3969 ceph_osdmap_destroy(osdc->osdmap);
2727 mempool_destroy(osdc->req_mempool); 3970 mempool_destroy(osdc->req_mempool);
2728 ceph_msgpool_destroy(&osdc->msgpool_op); 3971 ceph_msgpool_destroy(&osdc->msgpool_op);
2729 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 3972 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
2752 return PTR_ERR(req); 3995 return PTR_ERR(req);
2753 3996
2754 /* it may be a short read due to an object boundary */ 3997 /* it may be a short read due to an object boundary */
2755
2756 osd_req_op_extent_osd_data_pages(req, 0, 3998 osd_req_op_extent_osd_data_pages(req, 0,
2757 pages, *plen, page_align, false, false); 3999 pages, *plen, page_align, false, false);
2758 4000
2759 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", 4001 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
2760 off, *plen, *plen, page_align); 4002 off, *plen, *plen, page_align);
2761 4003
2762 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
2763
2764 rc = ceph_osdc_start_request(osdc, req, false); 4004 rc = ceph_osdc_start_request(osdc, req, false);
2765 if (!rc) 4005 if (!rc)
2766 rc = ceph_osdc_wait_request(osdc, req); 4006 rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2786 int rc = 0; 4026 int rc = 0;
2787 int page_align = off & ~PAGE_MASK; 4027 int page_align = off & ~PAGE_MASK;
2788 4028
2789 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
2790 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 4029 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
2791 CEPH_OSD_OP_WRITE, 4030 CEPH_OSD_OP_WRITE,
2792 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 4031 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2800 false, false); 4039 false, false);
2801 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); 4040 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
2802 4041
2803 ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); 4042 req->r_mtime = *mtime;
2804
2805 rc = ceph_osdc_start_request(osdc, req, true); 4043 rc = ceph_osdc_start_request(osdc, req, true);
2806 if (!rc) 4044 if (!rc)
2807 rc = ceph_osdc_wait_request(osdc, req); 4045 rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
2841static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 4079static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2842{ 4080{
2843 struct ceph_osd *osd = con->private; 4081 struct ceph_osd *osd = con->private;
2844 struct ceph_osd_client *osdc; 4082 struct ceph_osd_client *osdc = osd->o_osdc;
2845 int type = le16_to_cpu(msg->hdr.type); 4083 int type = le16_to_cpu(msg->hdr.type);
2846 4084
2847 if (!osd)
2848 goto out;
2849 osdc = osd->o_osdc;
2850
2851 switch (type) { 4085 switch (type) {
2852 case CEPH_MSG_OSD_MAP: 4086 case CEPH_MSG_OSD_MAP:
2853 ceph_osdc_handle_map(osdc, msg); 4087 ceph_osdc_handle_map(osdc, msg);
2854 break; 4088 break;
2855 case CEPH_MSG_OSD_OPREPLY: 4089 case CEPH_MSG_OSD_OPREPLY:
2856 handle_reply(osdc, msg); 4090 handle_reply(osd, msg);
2857 break; 4091 break;
2858 case CEPH_MSG_WATCH_NOTIFY: 4092 case CEPH_MSG_WATCH_NOTIFY:
2859 handle_watch_notify(osdc, msg); 4093 handle_watch_notify(osdc, msg);
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2863 pr_err("received unknown message type %d %s\n", type, 4097 pr_err("received unknown message type %d %s\n", type,
2864 ceph_msg_type_name(type)); 4098 ceph_msg_type_name(type));
2865 } 4099 }
2866out: 4100
2867 ceph_msg_put(msg); 4101 ceph_msg_put(msg);
2868} 4102}
2869 4103
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2878{ 4112{
2879 struct ceph_osd *osd = con->private; 4113 struct ceph_osd *osd = con->private;
2880 struct ceph_osd_client *osdc = osd->o_osdc; 4114 struct ceph_osd_client *osdc = osd->o_osdc;
2881 struct ceph_msg *m; 4115 struct ceph_msg *m = NULL;
2882 struct ceph_osd_request *req; 4116 struct ceph_osd_request *req;
2883 int front_len = le32_to_cpu(hdr->front_len); 4117 int front_len = le32_to_cpu(hdr->front_len);
2884 int data_len = le32_to_cpu(hdr->data_len); 4118 int data_len = le32_to_cpu(hdr->data_len);
2885 u64 tid; 4119 u64 tid = le64_to_cpu(hdr->tid);
2886 4120
2887 tid = le64_to_cpu(hdr->tid); 4121 down_read(&osdc->lock);
2888 mutex_lock(&osdc->request_mutex); 4122 if (!osd_registered(osd)) {
2889 req = __lookup_request(osdc, tid); 4123 dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
4124 *skip = 1;
4125 goto out_unlock_osdc;
4126 }
4127 WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
4128
4129 mutex_lock(&osd->lock);
4130 req = lookup_request(&osd->o_requests, tid);
2890 if (!req) { 4131 if (!req) {
2891 dout("%s osd%d tid %llu unknown, skipping\n", __func__, 4132 dout("%s osd%d tid %llu unknown, skipping\n", __func__,
2892 osd->o_osd, tid); 4133 osd->o_osd, tid);
2893 m = NULL;
2894 *skip = 1; 4134 *skip = 1;
2895 goto out; 4135 goto out_unlock_session;
2896 } 4136 }
2897 4137
2898 ceph_msg_revoke_incoming(req->r_reply); 4138 ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2904 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 4144 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
2905 false); 4145 false);
2906 if (!m) 4146 if (!m)
2907 goto out; 4147 goto out_unlock_session;
2908 ceph_msg_put(req->r_reply); 4148 ceph_msg_put(req->r_reply);
2909 req->r_reply = m; 4149 req->r_reply = m;
2910 } 4150 }
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2915 req->r_reply->data_length); 4155 req->r_reply->data_length);
2916 m = NULL; 4156 m = NULL;
2917 *skip = 1; 4157 *skip = 1;
2918 goto out; 4158 goto out_unlock_session;
2919 } 4159 }
2920 4160
2921 m = ceph_msg_get(req->r_reply); 4161 m = ceph_msg_get(req->r_reply);
2922 dout("get_reply tid %lld %p\n", tid, m); 4162 dout("get_reply tid %lld %p\n", tid, m);
2923 4163
2924out: 4164out_unlock_session:
2925 mutex_unlock(&osdc->request_mutex); 4165 mutex_unlock(&osd->lock);
4166out_unlock_osdc:
4167 up_read(&osdc->lock);
4168 return m;
4169}
4170
4171/*
4172 * TODO: switch to a msg-owned pagelist
4173 */
4174static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
4175{
4176 struct ceph_msg *m;
4177 int type = le16_to_cpu(hdr->type);
4178 u32 front_len = le32_to_cpu(hdr->front_len);
4179 u32 data_len = le32_to_cpu(hdr->data_len);
4180
4181 m = ceph_msg_new(type, front_len, GFP_NOIO, false);
4182 if (!m)
4183 return NULL;
4184
4185 if (data_len) {
4186 struct page **pages;
4187 struct ceph_osd_data osd_data;
4188
4189 pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
4190 GFP_NOIO);
4191 if (!pages) {
4192 ceph_msg_put(m);
4193 return NULL;
4194 }
4195
4196 ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
4197 false);
4198 ceph_osdc_msg_data_add(m, &osd_data);
4199 }
4200
2926 return m; 4201 return m;
2927} 4202}
2928 4203
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
2932{ 4207{
2933 struct ceph_osd *osd = con->private; 4208 struct ceph_osd *osd = con->private;
2934 int type = le16_to_cpu(hdr->type); 4209 int type = le16_to_cpu(hdr->type);
2935 int front = le32_to_cpu(hdr->front_len);
2936 4210
2937 *skip = 0; 4211 *skip = 0;
2938 switch (type) { 4212 switch (type) {
2939 case CEPH_MSG_OSD_MAP: 4213 case CEPH_MSG_OSD_MAP:
2940 case CEPH_MSG_WATCH_NOTIFY: 4214 case CEPH_MSG_WATCH_NOTIFY:
2941 return ceph_msg_new(type, front, GFP_NOFS, false); 4215 return alloc_msg_with_page_vector(hdr);
2942 case CEPH_MSG_OSD_OPREPLY: 4216 case CEPH_MSG_OSD_OPREPLY:
2943 return get_reply(con, hdr, skip); 4217 return get_reply(con, hdr, skip);
2944 default: 4218 default:
2945 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, 4219 pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
2946 osd->o_osd); 4220 osd->o_osd, type);
2947 *skip = 1; 4221 *skip = 1;
2948 return NULL; 4222 return NULL;
2949 } 4223 }
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = {
3047 .alloc_msg = alloc_msg, 4321 .alloc_msg = alloc_msg,
3048 .sign_message = osd_sign_message, 4322 .sign_message = osd_sign_message,
3049 .check_message_signature = osd_check_message_signature, 4323 .check_message_signature = osd_check_message_signature,
3050 .fault = osd_reset, 4324 .fault = osd_fault,
3051}; 4325};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..cde52e94732f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
380 return ERR_PTR(err); 380 return ERR_PTR(err);
381} 381}
382 382
383/* 383int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
384 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
385 * to a set of osds) and primary_temp (explicit primary setting)
386 */
387static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
388{ 384{
389 if (l.pool < r.pool) 385 if (lhs->pool < rhs->pool)
390 return -1; 386 return -1;
391 if (l.pool > r.pool) 387 if (lhs->pool > rhs->pool)
392 return 1; 388 return 1;
393 if (l.seed < r.seed) 389 if (lhs->seed < rhs->seed)
394 return -1; 390 return -1;
395 if (l.seed > r.seed) 391 if (lhs->seed > rhs->seed)
396 return 1; 392 return 1;
393
397 return 0; 394 return 0;
398} 395}
399 396
397/*
398 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
399 * to a set of osds) and primary_temp (explicit primary setting)
400 */
400static int __insert_pg_mapping(struct ceph_pg_mapping *new, 401static int __insert_pg_mapping(struct ceph_pg_mapping *new,
401 struct rb_root *root) 402 struct rb_root *root)
402{ 403{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
409 while (*p) { 410 while (*p) {
410 parent = *p; 411 parent = *p;
411 pg = rb_entry(parent, struct ceph_pg_mapping, node); 412 pg = rb_entry(parent, struct ceph_pg_mapping, node);
412 c = pgid_cmp(new->pgid, pg->pgid); 413 c = ceph_pg_compare(&new->pgid, &pg->pgid);
413 if (c < 0) 414 if (c < 0)
414 p = &(*p)->rb_left; 415 p = &(*p)->rb_left;
415 else if (c > 0) 416 else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
432 433
433 while (n) { 434 while (n) {
434 pg = rb_entry(n, struct ceph_pg_mapping, node); 435 pg = rb_entry(n, struct ceph_pg_mapping, node);
435 c = pgid_cmp(pgid, pg->pgid); 436 c = ceph_pg_compare(&pgid, &pg->pgid);
436 if (c < 0) { 437 if (c < 0) {
437 n = n->rb_left; 438 n = n->rb_left;
438 } else if (c > 0) { 439 } else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
596 *p += 4; /* skip crash_replay_interval */ 597 *p += 4; /* skip crash_replay_interval */
597 598
598 if (ev >= 7) 599 if (ev >= 7)
599 *p += 1; /* skip min_size */ 600 pi->min_size = ceph_decode_8(p);
601 else
602 pi->min_size = pi->size - pi->size / 2;
600 603
601 if (ev >= 8) 604 if (ev >= 8)
602 *p += 8 + 8; /* skip quota_max_* */ 605 *p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
616 pi->write_tier = -1; 619 pi->write_tier = -1;
617 } 620 }
618 621
622 if (ev >= 10) {
623 /* skip properties */
624 num = ceph_decode_32(p);
625 while (num--) {
626 len = ceph_decode_32(p);
627 *p += len; /* key */
628 len = ceph_decode_32(p);
629 *p += len; /* val */
630 }
631 }
632
633 if (ev >= 11) {
634 /* skip hit_set_params */
635 *p += 1 + 1; /* versions */
636 len = ceph_decode_32(p);
637 *p += len;
638
639 *p += 4; /* skip hit_set_period */
640 *p += 4; /* skip hit_set_count */
641 }
642
643 if (ev >= 12)
644 *p += 4; /* skip stripe_width */
645
646 if (ev >= 13) {
647 *p += 8; /* skip target_max_bytes */
648 *p += 8; /* skip target_max_objects */
649 *p += 4; /* skip cache_target_dirty_ratio_micro */
650 *p += 4; /* skip cache_target_full_ratio_micro */
651 *p += 4; /* skip cache_min_flush_age */
652 *p += 4; /* skip cache_min_evict_age */
653 }
654
655 if (ev >= 14) {
656 /* skip erasure_code_profile */
657 len = ceph_decode_32(p);
658 *p += len;
659 }
660
661 if (ev >= 15)
662 pi->last_force_request_resend = ceph_decode_32(p);
663 else
664 pi->last_force_request_resend = 0;
665
619 /* ignore the rest */ 666 /* ignore the rest */
620 667
621 *p = pool_end; 668 *p = pool_end;
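For ev < 7 encodings the decoder now derives the default rather than skipping the field: min_size = size - size / 2 with integer division is a majority rounded up, e.g. size 2 -> 1, size 3 -> 2, size 4 -> 2, size 5 -> 3.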
@@ -660,6 +707,23 @@ bad:
660/* 707/*
661 * osd map 708 * osd map
662 */ 709 */
710struct ceph_osdmap *ceph_osdmap_alloc(void)
711{
712 struct ceph_osdmap *map;
713
714 map = kzalloc(sizeof(*map), GFP_NOIO);
715 if (!map)
716 return NULL;
717
718 map->pg_pools = RB_ROOT;
719 map->pool_max = -1;
720 map->pg_temp = RB_ROOT;
721 map->primary_temp = RB_ROOT;
722 mutex_init(&map->crush_scratch_mutex);
723
724 return map;
725}
726
663void ceph_osdmap_destroy(struct ceph_osdmap *map) 727void ceph_osdmap_destroy(struct ceph_osdmap *map)
664{ 728{
665 dout("osdmap_destroy %p\n", map); 729 dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1183 struct ceph_osdmap *map; 1247 struct ceph_osdmap *map;
1184 int ret; 1248 int ret;
1185 1249
1186 map = kzalloc(sizeof(*map), GFP_NOFS); 1250 map = ceph_osdmap_alloc();
1187 if (!map) 1251 if (!map)
1188 return ERR_PTR(-ENOMEM); 1252 return ERR_PTR(-ENOMEM);
1189 1253
1190 map->pg_temp = RB_ROOT;
1191 map->primary_temp = RB_ROOT;
1192 mutex_init(&map->crush_scratch_mutex);
1193
1194 ret = osdmap_decode(p, end, map); 1254 ret = osdmap_decode(p, end, map);
1195 if (ret) { 1255 if (ret) {
1196 ceph_osdmap_destroy(map); 1256 ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1204 * decode and apply an incremental map update. 1264 * decode and apply an incremental map update.
1205 */ 1265 */
1206struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1266struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1207 struct ceph_osdmap *map, 1267 struct ceph_osdmap *map)
1208 struct ceph_messenger *msgr)
1209{ 1268{
1210 struct crush_map *newcrush = NULL; 1269 struct crush_map *newcrush = NULL;
1211 struct ceph_fsid fsid; 1270 struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
1381 return ERR_PTR(err); 1440 return ERR_PTR(err);
1382} 1441}
1383 1442
1443void ceph_oid_copy(struct ceph_object_id *dest,
1444 const struct ceph_object_id *src)
1445{
1446 WARN_ON(!ceph_oid_empty(dest));
1447
1448 if (src->name != src->inline_name) {
1449 /* very rare, see ceph_object_id definition */
1450 dest->name = kmalloc(src->name_len + 1,
1451 GFP_NOIO | __GFP_NOFAIL);
1452 }
1453
1454 memcpy(dest->name, src->name, src->name_len + 1);
1455 dest->name_len = src->name_len;
1456}
1457EXPORT_SYMBOL(ceph_oid_copy);
1458
1459static __printf(2, 0)
1460int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
1461{
1462 int len;
1463
1464 WARN_ON(!ceph_oid_empty(oid));
1465
1466 len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
1467 if (len >= sizeof(oid->inline_name))
1468 return len;
1469
1470 oid->name_len = len;
1471 return 0;
1472}
1473
1474/*
1475 * If oid doesn't fit into inline buffer, BUG.
1476 */
1477void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
1478{
1479 va_list ap;
1480
1481 va_start(ap, fmt);
1482 BUG_ON(oid_printf_vargs(oid, fmt, ap));
1483 va_end(ap);
1484}
1485EXPORT_SYMBOL(ceph_oid_printf);
1486
1487static __printf(3, 0)
1488int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
1489 const char *fmt, va_list ap)
1490{
1491 va_list aq;
1492 int len;
1493
1494 va_copy(aq, ap);
1495 len = oid_printf_vargs(oid, fmt, aq);
1496 va_end(aq);
1497
1498 if (len) {
1499 char *external_name;
1500
1501 external_name = kmalloc(len + 1, gfp);
1502 if (!external_name)
1503 return -ENOMEM;
1504
1505 oid->name = external_name;
1506 WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
1507 oid->name_len = len;
1508 }
1509
1510 return 0;
1511}
1512
1513/*
1514 * If oid doesn't fit into inline buffer, allocate.
1515 */
1516int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
1517 const char *fmt, ...)
1518{
1519 va_list ap;
1520 int ret;
1521
1522 va_start(ap, fmt);
1523 ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
1524 va_end(ap);
1525
1526 return ret;
1527}
1528EXPORT_SYMBOL(ceph_oid_aprintf);
1529
1530void ceph_oid_destroy(struct ceph_object_id *oid)
1531{
1532 if (oid->name != oid->inline_name)
1533 kfree(oid->name);
1534}
1535EXPORT_SYMBOL(ceph_oid_destroy);
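A hypothetical caller tying the oid helpers together; ceph_oid_init() is assumed to be the matching initializer (it points name at inline_name), and the name format is illustrative:

static int set_object_name(struct ceph_object_id *oid, const char *image_id)
{
	int ret;

	ceph_oid_init(oid);

	/* long names spill into a heap buffer; short ones stay inline */
	ret = ceph_oid_aprintf(oid, GFP_KERNEL, "rbd_header.%s", image_id);
	if (ret)
		return ret;

	/* ... use oid ... */

	ceph_oid_destroy(oid);	/* frees the heap buffer, if any */
	return 0;
}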
1536
1537/*
1538 * osds only
1539 */
1540static bool __osds_equal(const struct ceph_osds *lhs,
1541 const struct ceph_osds *rhs)
1542{
1543 if (lhs->size == rhs->size &&
1544 !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
1545 return true;
1546
1547 return false;
1548}
1549
1550/*
1551 * osds + primary
1552 */
1553static bool osds_equal(const struct ceph_osds *lhs,
1554 const struct ceph_osds *rhs)
1555{
1556 if (__osds_equal(lhs, rhs) &&
1557 lhs->primary == rhs->primary)
1558 return true;
1559
1560 return false;
1561}
1562
1563static bool osds_valid(const struct ceph_osds *set)
1564{
1565 /* non-empty set */
1566 if (set->size > 0 && set->primary >= 0)
1567 return true;
1568
1569 /* empty can_shift_osds set */
1570 if (!set->size && set->primary == -1)
1571 return true;
1572
1573 /* empty !can_shift_osds set - all NONE */
1574 if (set->size > 0 && set->primary == -1) {
1575 int i;
1576
1577 for (i = 0; i < set->size; i++) {
1578 if (set->osds[i] != CRUSH_ITEM_NONE)
1579 break;
1580 }
1581 if (i == set->size)
1582 return true;
1583 }
1584
1585 return false;
1586}
1587
1588void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
1589{
1590 memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
1591 dest->size = src->size;
1592 dest->primary = src->primary;
1593}
1594
1595static bool is_split(const struct ceph_pg *pgid,
1596 u32 old_pg_num,
1597 u32 new_pg_num)
1598{
1599 int old_bits = calc_bits_of(old_pg_num);
1600 int old_mask = (1 << old_bits) - 1;
1601 int n;
1602
1603 WARN_ON(pgid->seed >= old_pg_num);
1604 if (new_pg_num <= old_pg_num)
1605 return false;
1606
1607 for (n = 1; ; n++) {
1608 int next_bit = n << (old_bits - 1);
1609 u32 s = next_bit | pgid->seed;
1610
1611 if (s < old_pg_num || s == pgid->seed)
1612 continue;
1613 if (s >= new_pg_num)
1614 break;
1615
1616 s = ceph_stable_mod(s, old_pg_num, old_mask);
1617 if (s == pgid->seed)
1618 return true;
1619 }
1620
1621 return false;
1622}
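A worked case: growing a pool from pg_num 4 to 8 with seed 1. calc_bits_of(4) = 3 gives old_mask = 7; the first candidate is s = (1 << 2) | 1 = 5, which is below new_pg_num, and ceph_stable_mod(5, 4, 7) = 5 & 3 = 1 matches the seed, so the PG is split (x.5 becomes a child of x.1).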
1623
1624bool ceph_is_new_interval(const struct ceph_osds *old_acting,
1625 const struct ceph_osds *new_acting,
1626 const struct ceph_osds *old_up,
1627 const struct ceph_osds *new_up,
1628 int old_size,
1629 int new_size,
1630 int old_min_size,
1631 int new_min_size,
1632 u32 old_pg_num,
1633 u32 new_pg_num,
1634 bool old_sort_bitwise,
1635 bool new_sort_bitwise,
1636 const struct ceph_pg *pgid)
1637{
1638 return !osds_equal(old_acting, new_acting) ||
1639 !osds_equal(old_up, new_up) ||
1640 old_size != new_size ||
1641 old_min_size != new_min_size ||
1642 is_split(pgid, old_pg_num, new_pg_num) ||
1643 old_sort_bitwise != new_sort_bitwise;
1644}
1645
1646static int calc_pg_rank(int osd, const struct ceph_osds *acting)
1647{
1648 int i;
1649
1650 for (i = 0; i < acting->size; i++) {
1651 if (acting->osds[i] == osd)
1652 return i;
1653 }
1654
1655 return -1;
1656}
1657
1658static bool primary_changed(const struct ceph_osds *old_acting,
1659 const struct ceph_osds *new_acting)
1660{
1661 if (!old_acting->size && !new_acting->size)
1662 return false; /* both still empty */
1384 1663
1664 if (!old_acting->size ^ !new_acting->size)
1665 return true; /* was empty, now not, or vice versa */
1385 1666
1667 if (old_acting->primary != new_acting->primary)
1668 return true; /* primary changed */
1669
1670 if (calc_pg_rank(old_acting->primary, old_acting) !=
1671 calc_pg_rank(new_acting->primary, new_acting))
1672 return true;
1673
1674 return false; /* same primary (tho replicas may have changed) */
1675}
1676
1677bool ceph_osds_changed(const struct ceph_osds *old_acting,
1678 const struct ceph_osds *new_acting,
1679 bool any_change)
1680{
1681 if (primary_changed(old_acting, new_acting))
1682 return true;
1683
1684 if (any_change && !__osds_equal(old_acting, new_acting))
1685 return true;
1686
1687 return false;
1688}
1386 1689
1387/* 1690/*
1388 * calculate file layout from given offset, length. 1691 * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
1455EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1758EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1456 1759
1457/* 1760/*
1458 * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1761 * Map an object into a PG.
1459 * called with target's (oloc, oid), since tiering isn't taken into 1762 *
1460 * account. 1763 * Should only be called with target_oid and target_oloc (as opposed to
1764 * base_oid and base_oloc), since tiering isn't taken into account.
1461 */ 1765 */
1462int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1766int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
1463 struct ceph_object_locator *oloc, 1767 struct ceph_object_id *oid,
1464 struct ceph_object_id *oid, 1768 struct ceph_object_locator *oloc,
1465 struct ceph_pg *pg_out) 1769 struct ceph_pg *raw_pgid)
1466{ 1770{
1467 struct ceph_pg_pool_info *pi; 1771 struct ceph_pg_pool_info *pi;
1468 1772
1469 pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1773 pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
1470 if (!pi) 1774 if (!pi)
1471 return -EIO; 1775 return -ENOENT;
1472 1776
1473 pg_out->pool = oloc->pool; 1777 raw_pgid->pool = oloc->pool;
1474 pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1778 raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
1475 oid->name_len); 1779 oid->name_len);
1476 1780
1477 dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1781 dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
1478 pg_out->pool, pg_out->seed); 1782 oid->name, raw_pgid->pool, raw_pgid->seed);
1479 return 0; 1783 return 0;
1480} 1784}
1481EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1785EXPORT_SYMBOL(ceph_object_locator_to_pg);
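A minimal lookup sketch; per the comment above, oid/oloc must already describe the target (tiering resolved):

static int print_raw_pg(struct ceph_osdmap *map,
			struct ceph_object_id *oid,
			struct ceph_object_locator *oloc)
{
	struct ceph_pg raw_pgid;
	int ret;

	ret = ceph_object_locator_to_pg(map, oid, oloc, &raw_pgid);
	if (ret)
		return ret;	/* -ENOENT: pool doesn't exist */

	pr_info("raw pgid %llu.%x\n", raw_pgid.pool, raw_pgid.seed);
	return 0;
}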
1786
1787/*
1788 * Map a raw PG (full precision ps) into an actual PG.
1789 */
1790static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
1791 const struct ceph_pg *raw_pgid,
1792 struct ceph_pg *pgid)
1793{
1794 pgid->pool = raw_pgid->pool;
1795 pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
1796 pi->pg_num_mask);
1797}
1798
1799/*
1800 * Map a raw PG (full precision ps) into a placement ps (placement
1801 * seed). Include pool id in that value so that different pools don't
1802 * use the same seeds.
1803 */
1804static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
1805 const struct ceph_pg *raw_pgid)
1806{
1807 if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1808 /* hash pool id and seed so that pool PGs do not overlap */
1809 return crush_hash32_2(CRUSH_HASH_RJENKINS1,
1810 ceph_stable_mod(raw_pgid->seed,
1811 pi->pgp_num,
1812 pi->pgp_num_mask),
1813 raw_pgid->pool);
1814 } else {
1815 /*
1816 * legacy behavior: add ps and pool together. this is
1817 * not a great approach because the PGs from each pool
1818 * will overlap on top of each other: 0.5 == 1.4 ==
1819 * 2.3 == ...
1820 */
1821 return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
1822 pi->pgp_num_mask) +
1823 (unsigned)raw_pgid->pool;
1824 }
1825}
1482 1826
1483static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 1827static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1484 int *result, int result_max, 1828 int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1497} 1841}
1498 1842
1499/* 1843/*
1500 * Calculate raw (crush) set for given pgid. 1844 * Calculate raw set (CRUSH output) for given PG. The result may
1845 * contain nonexistent OSDs. ->primary is undefined for a raw set.
1501 * 1846 *
1502 * Return raw set length, or error. 1847 * Placement seed (CRUSH input) is returned through @ppps.
1503 */ 1848 */
1504static int pg_to_raw_osds(struct ceph_osdmap *osdmap, 1849static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
1505 struct ceph_pg_pool_info *pool, 1850 struct ceph_pg_pool_info *pi,
1506 struct ceph_pg pgid, u32 pps, int *osds) 1851 const struct ceph_pg *raw_pgid,
1852 struct ceph_osds *raw,
1853 u32 *ppps)
1507{ 1854{
1855 u32 pps = raw_pg_to_pps(pi, raw_pgid);
1508 int ruleno; 1856 int ruleno;
1509 int len; 1857 int len;
1510 1858
1511 /* crush */ 1859 ceph_osds_init(raw);
1512 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1860 if (ppps)
1513 pool->type, pool->size); 1861 *ppps = pps;
1862
1863 ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
1864 pi->size);
1514 if (ruleno < 0) { 1865 if (ruleno < 0) {
1515 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", 1866 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1516 pgid.pool, pool->crush_ruleset, pool->type, 1867 pi->id, pi->crush_ruleset, pi->type, pi->size);
1517 pool->size); 1868 return;
1518 return -ENOENT;
1519 } 1869 }
1520 1870
1521 len = do_crush(osdmap, ruleno, pps, osds, 1871 len = do_crush(osdmap, ruleno, pps, raw->osds,
1522 min_t(int, pool->size, CEPH_PG_MAX_SIZE), 1872 min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
1523 osdmap->osd_weight, osdmap->max_osd); 1873 osdmap->osd_weight, osdmap->max_osd);
1524 if (len < 0) { 1874 if (len < 0) {
1525 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 1875 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1526 len, ruleno, pgid.pool, pool->crush_ruleset, 1876 len, ruleno, pi->id, pi->crush_ruleset, pi->type,
1527 pool->type, pool->size); 1877 pi->size);
1528 return len; 1878 return;
1529 } 1879 }
1530 1880
1531 return len; 1881 raw->size = len;
1532} 1882}
1533 1883
1534/* 1884/*
1535 * Given raw set, calculate up set and up primary. 1885 * Given raw set, calculate up set and up primary. By definition of an
1886 * up set, the result won't contain nonexistent or down OSDs.
1536 * 1887 *
1537 * Return up set length. *primary is set to up primary osd id, or -1 1888 * This is done in-place - on return @set is the up set. If it's
1538 * if up set is empty. 1889 * empty, ->primary will remain undefined.
1539 */ 1890 */
1540static int raw_to_up_osds(struct ceph_osdmap *osdmap, 1891static void raw_to_up_osds(struct ceph_osdmap *osdmap,
1541 struct ceph_pg_pool_info *pool, 1892 struct ceph_pg_pool_info *pi,
1542 int *osds, int len, int *primary) 1893 struct ceph_osds *set)
1543{ 1894{
1544 int up_primary = -1;
1545 int i; 1895 int i;
1546 1896
1547 if (ceph_can_shift_osds(pool)) { 1897 /* ->primary is undefined for a raw set */
1898 BUG_ON(set->primary != -1);
1899
1900 if (ceph_can_shift_osds(pi)) {
1548 int removed = 0; 1901 int removed = 0;
1549 1902
1550 for (i = 0; i < len; i++) { 1903 /* shift left */
1551 if (ceph_osd_is_down(osdmap, osds[i])) { 1904 for (i = 0; i < set->size; i++) {
1905 if (ceph_osd_is_down(osdmap, set->osds[i])) {
1552 removed++; 1906 removed++;
1553 continue; 1907 continue;
1554 } 1908 }
1555 if (removed) 1909 if (removed)
1556 osds[i - removed] = osds[i]; 1910 set->osds[i - removed] = set->osds[i];
1557 } 1911 }
1558 1912 set->size -= removed;
1559 len -= removed; 1913 if (set->size > 0)
1560 if (len > 0) 1914 set->primary = set->osds[0];
1561 up_primary = osds[0];
1562 } else { 1915 } else {
1563 for (i = len - 1; i >= 0; i--) { 1916 /* set down/dne devices to NONE */
1564 if (ceph_osd_is_down(osdmap, osds[i])) 1917 for (i = set->size - 1; i >= 0; i--) {
1565 osds[i] = CRUSH_ITEM_NONE; 1918 if (ceph_osd_is_down(osdmap, set->osds[i]))
1919 set->osds[i] = CRUSH_ITEM_NONE;
1566 else 1920 else
1567 up_primary = osds[i]; 1921 set->primary = set->osds[i];
1568 } 1922 }
1569 } 1923 }
1570
1571 *primary = up_primary;
1572 return len;
1573} 1924}
1574 1925
1575static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, 1926static void apply_primary_affinity(struct ceph_osdmap *osdmap,
1576 struct ceph_pg_pool_info *pool, 1927 struct ceph_pg_pool_info *pi,
1577 int *osds, int len, int *primary) 1928 u32 pps,
1929 struct ceph_osds *up)
1578{ 1930{
1579 int i; 1931 int i;
1580 int pos = -1; 1932 int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1586 if (!osdmap->osd_primary_affinity) 1938 if (!osdmap->osd_primary_affinity)
1587 return; 1939 return;
1588 1940
1589 for (i = 0; i < len; i++) { 1941 for (i = 0; i < up->size; i++) {
1590 int osd = osds[i]; 1942 int osd = up->osds[i];
1591 1943
1592 if (osd != CRUSH_ITEM_NONE && 1944 if (osd != CRUSH_ITEM_NONE &&
1593 osdmap->osd_primary_affinity[osd] != 1945 osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1595 break; 1947 break;
1596 } 1948 }
1597 } 1949 }
1598 if (i == len) 1950 if (i == up->size)
1599 return; 1951 return;
1600 1952
1601 /* 1953 /*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1603 * osd into the hash/rng so that a proportional fraction of an 1955 * osd into the hash/rng so that a proportional fraction of an
1604 * osd's pgs get rejected as primary. 1956 * osd's pgs get rejected as primary.
1605 */ 1957 */
1606 for (i = 0; i < len; i++) { 1958 for (i = 0; i < up->size; i++) {
1607 int osd = osds[i]; 1959 int osd = up->osds[i];
1608 u32 aff; 1960 u32 aff;
1609 1961
1610 if (osd == CRUSH_ITEM_NONE) 1962 if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1629 if (pos < 0) 1981 if (pos < 0)
1630 return; 1982 return;
1631 1983
1632 *primary = osds[pos]; 1984 up->primary = up->osds[pos];
1633 1985
1634 if (ceph_can_shift_osds(pool) && pos > 0) { 1986 if (ceph_can_shift_osds(pi) && pos > 0) {
1635 /* move the new primary to the front */ 1987 /* move the new primary to the front */
1636 for (i = pos; i > 0; i--) 1988 for (i = pos; i > 0; i--)
1637 osds[i] = osds[i - 1]; 1989 up->osds[i] = up->osds[i - 1];
1638 osds[0] = *primary; 1990 up->osds[0] = up->primary;
1639 } 1991 }
1640} 1992}
1641 1993
1642/* 1994/*
1643 * Given up set, apply pg_temp and primary_temp mappings. 1995 * Get pg_temp and primary_temp mappings for given PG.
1644 * 1996 *
1645 * Return acting set length. *primary is set to acting primary osd id, 1997 * Note that a PG may have none, only pg_temp, only primary_temp or
1646 * or -1 if acting set is empty. 1998 * both pg_temp and primary_temp mappings. This means @temp isn't
1999 * always a valid OSD set on return: in the "only primary_temp" case,
2000 * @temp will have its ->primary >= 0 but ->size == 0.
1647 */ 2001 */
1648static int apply_temps(struct ceph_osdmap *osdmap, 2002static void get_temp_osds(struct ceph_osdmap *osdmap,
1649 struct ceph_pg_pool_info *pool, struct ceph_pg pgid, 2003 struct ceph_pg_pool_info *pi,
1650 int *osds, int len, int *primary) 2004 const struct ceph_pg *raw_pgid,
2005 struct ceph_osds *temp)
1651{ 2006{
2007 struct ceph_pg pgid;
1652 struct ceph_pg_mapping *pg; 2008 struct ceph_pg_mapping *pg;
1653 int temp_len;
1654 int temp_primary;
1655 int i; 2009 int i;
1656 2010
1657 /* raw_pg -> pg */ 2011 raw_pg_to_pg(pi, raw_pgid, &pgid);
1658 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 2012 ceph_osds_init(temp);
1659 pool->pg_num_mask);
1660 2013
1661 /* pg_temp? */ 2014 /* pg_temp? */
1662 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 2015 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1663 if (pg) { 2016 if (pg) {
1664 temp_len = 0;
1665 temp_primary = -1;
1666
1667 for (i = 0; i < pg->pg_temp.len; i++) { 2017 for (i = 0; i < pg->pg_temp.len; i++) {
1668 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2018 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
1669 if (ceph_can_shift_osds(pool)) 2019 if (ceph_can_shift_osds(pi))
1670 continue; 2020 continue;
1671 else 2021
1672 osds[temp_len++] = CRUSH_ITEM_NONE; 2022 temp->osds[temp->size++] = CRUSH_ITEM_NONE;
1673 } else { 2023 } else {
1674 osds[temp_len++] = pg->pg_temp.osds[i]; 2024 temp->osds[temp->size++] = pg->pg_temp.osds[i];
1675 } 2025 }
1676 } 2026 }
1677 2027
1678 /* apply pg_temp's primary */ 2028 /* apply pg_temp's primary */
1679 for (i = 0; i < temp_len; i++) { 2029 for (i = 0; i < temp->size; i++) {
1680 if (osds[i] != CRUSH_ITEM_NONE) { 2030 if (temp->osds[i] != CRUSH_ITEM_NONE) {
1681 temp_primary = osds[i]; 2031 temp->primary = temp->osds[i];
1682 break; 2032 break;
1683 } 2033 }
1684 } 2034 }
1685 } else {
1686 temp_len = len;
1687 temp_primary = *primary;
1688 } 2035 }
1689 2036
1690 /* primary_temp? */ 2037 /* primary_temp? */
1691 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); 2038 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
1692 if (pg) 2039 if (pg)
1693 temp_primary = pg->primary_temp.osd; 2040 temp->primary = pg->primary_temp.osd;
1694
1695 *primary = temp_primary;
1696 return temp_len;
1697} 2041}
1698 2042
1699/* 2043/*
1700 * Calculate acting set for given pgid. 2044 * Map a PG to its acting set as well as its up set.
1701 * 2045 *
1702 * Return acting set length, or error. *primary is set to acting 2046 * Acting set is used for data mapping purposes, while up set can be
1703 * primary osd id, or -1 if acting set is empty or on error. 2047 * recorded for detecting interval changes and deciding whether to
2048 * resend a request.
1704 */ 2049 */
1705int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 2050void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
1706 int *osds, int *primary) 2051 const struct ceph_pg *raw_pgid,
2052 struct ceph_osds *up,
2053 struct ceph_osds *acting)
1707{ 2054{
1708 struct ceph_pg_pool_info *pool; 2055 struct ceph_pg_pool_info *pi;
1709 u32 pps; 2056 u32 pps;
1710 int len;
1711 2057
1712 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 2058 pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
1713 if (!pool) { 2059 if (!pi) {
1714 *primary = -1; 2060 ceph_osds_init(up);
1715 return -ENOENT; 2061 ceph_osds_init(acting);
2062 goto out;
1716 } 2063 }
1717 2064
1718 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 2065 pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
1719 /* hash pool id and seed so that pool PGs do not overlap */ 2066 raw_to_up_osds(osdmap, pi, up);
1720 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 2067 apply_primary_affinity(osdmap, pi, pps, up);
1721 ceph_stable_mod(pgid.seed, pool->pgp_num, 2068 get_temp_osds(osdmap, pi, raw_pgid, acting);
1722 pool->pgp_num_mask), 2069 if (!acting->size) {
1723 pgid.pool); 2070 memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
1724 } else { 2071 acting->size = up->size;
1725 /* 2072 if (acting->primary == -1)
1726 * legacy behavior: add ps and pool together. this is 2073 acting->primary = up->primary;
1727 * not a great approach because the PGs from each pool
1728 * will overlap on top of each other: 0.5 == 1.4 ==
1729 * 2.3 == ...
1730 */
1731 pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
1732 pool->pgp_num_mask) +
1733 (unsigned)pgid.pool;
1734 }
1735
1736 len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
1737 if (len < 0) {
1738 *primary = -1;
1739 return len;
1740 } 2074 }
1741 2075out:
1742 len = raw_to_up_osds(osdmap, pool, osds, len, primary); 2076 WARN_ON(!osds_valid(up) || !osds_valid(acting));
1743
1744 apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
1745
1746 len = apply_temps(osdmap, pool, pgid, osds, len, primary);
1747
1748 return len;
1749} 2077}
1750 2078
1751/* 2079/*
1752 * Return primary osd for given pgid, or -1 if none. 2080 * Return acting primary for given PG, or -1 if none.
1753 */ 2081 */
1754int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 2082int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
2083 const struct ceph_pg *raw_pgid)
1755{ 2084{
1756 int osds[CEPH_PG_MAX_SIZE]; 2085 struct ceph_osds up, acting;
1757 int primary;
1758
1759 ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
1760 2086
1761 return primary; 2087 ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
2088 return acting.primary;
1762} 2089}
1763EXPORT_SYMBOL(ceph_calc_pg_primary); 2090EXPORT_SYMBOL(ceph_pg_to_acting_primary);
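A sketch dumping both sets for one raw PG, following the same call sequence as ceph_pg_to_acting_primary() above (names are illustrative):

static void dump_pg_mapping(struct ceph_osdmap *map,
			    const struct ceph_pg *raw_pgid)
{
	struct ceph_osds up, acting;

	ceph_pg_to_up_acting_osds(map, raw_pgid, &up, &acting);

	/* acting is used for mapping; up for interval-change detection */
	pr_info("up.primary %d acting.primary %d acting.size %d\n",
		up.primary, acting.primary, acting.size);
}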