author	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-26 17:10:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-26 17:10:32 -0400
commit	a10c38a4f385f5d7c173a263ff6bb2d36021b3bb (patch)
tree	3cbaa916940b36a9fdb27c8a231e1488fbc352d6
parent	ea8ea737c46cffa5d0ee74309f81e55a7e5e9c2a (diff)
parent	e536030934aebf049fe6aaebc58dd37aeee21840 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
 "This changeset has a few main parts:

  - Ilya has finished a huge refactoring effort to sync up the
    client-side logic in libceph with the user-space client code, which
    has evolved significantly over the last couple years, with lots of
    additional behaviors (e.g., how requests are handled when cluster
    is full and transitions from full to non-full).

    This structure of the code is more closely aligned with userspace
    now such that it will be much easier to maintain going forward when
    behavior changes take place.  There are some locking improvements
    bundled in as well.

  - Zheng adds multi-filesystem support (multiple namespaces within the
    same Ceph cluster)

  - Zheng has changed the readdir offsets and directory enumeration so
    that dentry offsets are hash-based and therefore stable across
    directory fragmentation events on the MDS.

  - Zheng has a smorgasbord of bug fixes across fs/ceph"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (71 commits)
  ceph: fix wake_up_session_cb()
  ceph: don't use truncate_pagecache() to invalidate read cache
  ceph: SetPageError() for writeback pages if writepages fails
  ceph: handle interrupted ceph_writepage()
  ceph: make ceph_update_writeable_page() uninterruptible
  libceph: make ceph_osdc_wait_request() uninterruptible
  ceph: handle -EAGAIN returned by ceph_update_writeable_page()
  ceph: make fault/page_mkwrite return VM_FAULT_OOM for -ENOMEM
  ceph: block non-fatal signals for fault/page_mkwrite
  ceph: make logical calculation functions return bool
  ceph: tolerate bad i_size for symlink inode
  ceph: improve fragtree change detection
  ceph: keep leaf frag when updating fragtree
  ceph: fix dir_auth check in ceph_fill_dirfrag()
  ceph: don't assume frag tree splits in mds reply are sorted
  ceph: fix inode reference leak
  ceph: using hash value to compose dentry offset
  ceph: don't forbid marking directory complete after forward seek
  ceph: record 'offset' for each entry of readdir result
  ceph: define 'end/complete' in readdir reply as bit flags
  ...
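The hash-based readdir offsets mentioned above pack either a directory frag value or a 24-bit name hash into the high bits of f_pos, with the top byte marking hash-order offsets. Below is a minimal user-space sketch of that encoding, mirroring the ceph_make_fpos()/fpos_off() helpers added in the fs/ceph/dir.c hunk later in this diff (a standalone reimplementation for illustration, not the kernel code itself):

/*
 * Sketch of the readdir f_pos encoding from fs/ceph/dir.c (see the
 * dir.c hunk below); illustrative user-space code, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OFFSET_BITS	28
#define OFFSET_MASK	((1 << OFFSET_BITS) - 1)
#define HASH_ORDER	(0xffull << (OFFSET_BITS + 24))

/* mirrors ceph_make_fpos(): frag value or name hash in the high bits */
static int64_t make_fpos(unsigned high, unsigned off, bool hash_order)
{
	int64_t fpos = ((int64_t)high << OFFSET_BITS) | (int64_t)off;

	if (hash_order)
		fpos |= HASH_ORDER;	/* top byte flags hash-order offsets */
	return fpos;
}

static bool is_hash_order(int64_t p)
{
	return (p & HASH_ORDER) == HASH_ORDER;
}

static unsigned fpos_off(int64_t p)
{
	return p & OFFSET_MASK;		/* nth entry within the frag/hash */
}

int main(void)
{
	/* frag+name order: offset depends on the current fragmentation */
	int64_t p1 = make_fpos(0x1234, 7, false);
	/* hash order: a 24-bit name hash, stable across MDS refragmentation */
	int64_t p2 = make_fpos(0xabcdef, 7, true);

	printf("%#llx hash_order=%d off=%u\n",
	       (unsigned long long)p1, is_hash_order(p1), fpos_off(p1));
	printf("%#llx hash_order=%d off=%u\n",
	       (unsigned long long)p2, is_hash_order(p2), fpos_off(p2));
	return 0;
}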
-rw-r--r--	drivers/block/rbd.c	305
-rw-r--r--	fs/ceph/addr.c	214
-rw-r--r--	fs/ceph/cache.c	2
-rw-r--r--	fs/ceph/caps.c	51
-rw-r--r--	fs/ceph/debugfs.c	2
-rw-r--r--	fs/ceph/dir.c	376
-rw-r--r--	fs/ceph/file.c	89
-rw-r--r--	fs/ceph/inode.c	159
-rw-r--r--	fs/ceph/ioctl.c	14
-rw-r--r--	fs/ceph/mds_client.c	140
-rw-r--r--	fs/ceph/mds_client.h	17
-rw-r--r--	fs/ceph/mdsmap.c	43
-rw-r--r--	fs/ceph/super.c	47
-rw-r--r--	fs/ceph/super.h	12
-rw-r--r--	fs/ceph/xattr.c	25
-rw-r--r--	include/linux/ceph/ceph_frag.h	4
-rw-r--r--	include/linux/ceph/ceph_fs.h	20
-rw-r--r--	include/linux/ceph/decode.h	2
-rw-r--r--	include/linux/ceph/libceph.h	57
-rw-r--r--	include/linux/ceph/mon_client.h	23
-rw-r--r--	include/linux/ceph/osd_client.h	231
-rw-r--r--	include/linux/ceph/osdmap.h	158
-rw-r--r--	include/linux/ceph/rados.h	34
-rw-r--r--	net/ceph/ceph_common.c	2
-rw-r--r--	net/ceph/ceph_strings.c	16
-rw-r--r--	net/ceph/debugfs.c	147
-rw-r--r--	net/ceph/mon_client.c	393
-rw-r--r--	net/ceph/osd_client.c	4032
-rw-r--r--	net/ceph/osdmap.c	651
29 files changed, 4758 insertions, 2508 deletions
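Much of the rbd.c delta in the diffstat above comes from the libceph watch/notify rework: the old watch_event/watch_request pair becomes one opaque linger handle returned by ceph_osdc_watch() and released by ceph_osdc_unwatch(). The sketch below models that handle-based pattern in user space, including the unwatch-then-rewatch recovery that rbd_watch_errcb() performs in the diff; watch_register()/watch_unregister() and struct dev are hypothetical stand-ins, not the kernel API:

/*
 * User-space analogue of the linger-handle watch pattern adopted by
 * rbd.c below.  watch_register()/watch_unregister() are hypothetical
 * stand-ins for ceph_osdc_watch()/ceph_osdc_unwatch().
 */
#include <stdio.h>
#include <stdlib.h>

struct watch_handle {
	void (*errcb)(void *arg, int err);
	void *arg;
};

static struct watch_handle *watch_register(void (*errcb)(void *, int),
					    void *arg)
{
	struct watch_handle *h = malloc(sizeof(*h));

	if (!h)
		return NULL;
	h->errcb = errcb;
	h->arg = arg;
	return h;
}

static void watch_unregister(struct watch_handle *h)
{
	free(h);
}

struct dev {
	struct watch_handle *watch_handle;	/* opaque handle, like rbd */
};

static void dev_errcb(void *arg, int err);

static int dev_watch(struct dev *d)
{
	d->watch_handle = watch_register(dev_errcb, d);
	return d->watch_handle ? 0 : -1;
}

/* mirrors rbd_watch_errcb(): tear the watch down, then re-register it */
static void dev_errcb(void *arg, int err)
{
	struct dev *d = arg;

	fprintf(stderr, "watch error %d, reregistering\n", err);
	watch_unregister(d->watch_handle);
	d->watch_handle = NULL;
	if (dev_watch(d))
		fprintf(stderr, "failed to reregister watch\n");
}

int main(void)
{
	struct dev d = { NULL };

	if (dev_watch(&d))
		return 1;
	d.watch_handle->errcb(d.watch_handle->arg, -5); /* simulate an error */
	watch_unregister(d.watch_handle);
	return 0;
}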
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0ede6d7e2568..81666a56415e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -350,12 +350,12 @@ struct rbd_device {
 	struct rbd_spec		*spec;
 	struct rbd_options	*opts;
 
-	char			*header_name;
+	struct ceph_object_id	header_oid;
+	struct ceph_object_locator header_oloc;
 
 	struct ceph_file_layout	layout;
 
-	struct ceph_osd_event   *watch_event;
-	struct rbd_obj_request	*watch_request;
+	struct ceph_osd_linger_request *watch_handle;
 
 	struct rbd_spec		*parent_spec;
 	u64			parent_overlap;
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
 	return __rbd_obj_request_wait(obj_request, 0);
 }
 
-static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
-					unsigned long timeout)
-{
-	return __rbd_obj_request_wait(obj_request, timeout);
-}
-
 static void rbd_img_request_complete(struct rbd_img_request *img_request)
 {
 
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
 	complete_all(&obj_request->completion);
 }
 
-static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
-{
-	dout("%s: obj %p\n", __func__, obj_request);
-	obj_request_done_set(obj_request);
-}
-
 static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = NULL;
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
 	obj_request_done_set(obj_request);
 }
 
-static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
-				 struct ceph_msg *msg)
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
 {
 	struct rbd_obj_request *obj_request = osd_req->r_priv;
 	u16 opcode;
 
-	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+	dout("%s: osd_req %p\n", __func__, osd_req);
 	rbd_assert(osd_req == obj_request->osd_req);
 	if (obj_request_img_data_test(obj_request)) {
 		rbd_assert(obj_request->img_request);
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 	case CEPH_OSD_OP_CALL:
 		rbd_osd_call_callback(obj_request);
 		break;
-	case CEPH_OSD_OP_NOTIFY_ACK:
-	case CEPH_OSD_OP_WATCH:
-		rbd_osd_trivial_callback(obj_request);
-		break;
 	default:
 		rbd_warn(NULL, "%s: unsupported op %hu",
 			 obj_request->object_name, (unsigned short) opcode);
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
 {
 	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	u64 snap_id;
 
-	rbd_assert(osd_req != NULL);
-
-	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			NULL, snap_id, NULL);
+	if (img_request)
+		osd_req->r_snapid = img_request->snap_id;
 }
 
 static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
 {
-	struct rbd_img_request *img_request = obj_request->img_request;
 	struct ceph_osd_request *osd_req = obj_request->osd_req;
-	struct ceph_snap_context *snapc;
-	struct timespec mtime = CURRENT_TIME;
 
-	rbd_assert(osd_req != NULL);
-
-	snapc = img_request ? img_request->snapc : NULL;
-	ceph_osdc_build_request(osd_req, obj_request->offset,
-			snapc, CEPH_NOSNAP, &mtime);
+	osd_req->r_mtime = CURRENT_TIME;
+	osd_req->r_data_offset = obj_request->offset;
 }
 
 /*
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
 					  GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 /*
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
 					  false, GFP_NOIO);
 	if (!osd_req)
-		return NULL;	/* ENOMEM */
+		goto fail;
 
 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
-	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+	if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
+			     obj_request->object_name))
+		goto fail;
+
+	if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO))
+		goto fail;
 
 	return osd_req;
+
+fail:
+	ceph_osdc_put_request(osd_req);
+	return NULL;
 }
 
 
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request)
 {
 	struct rbd_obj_request *obj_request;
 	struct rbd_obj_request *next_obj_request;
+	int ret = 0;
 
 	dout("%s: img %p\n", __func__, img_request);
-	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
-		int ret;
 
+	rbd_img_request_get(img_request);
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
 		ret = rbd_img_obj_request_submit(obj_request);
 		if (ret)
-			return ret;
+			goto out_put_ireq;
 	}
 
-	return 0;
+out_put_ireq:
+	rbd_img_request_put(img_request);
+	return ret;
 }
 
 static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
@@ -3090,45 +3084,18 @@ out_err:
 	obj_request_done_set(obj_request);
 }
 
-static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
-{
-	struct rbd_obj_request *obj_request;
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	int ret;
-
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return -ENOMEM;
-
-	ret = -ENOMEM;
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
-						  obj_request);
-	if (!obj_request->osd_req)
-		goto out;
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
-			      notify_id, 0, 0);
-	rbd_osd_req_format_read(obj_request);
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev);
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev);
 
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
-	ret = rbd_obj_request_wait(obj_request);
-out:
-	rbd_obj_request_put(obj_request);
-
-	return ret;
-}
-
-static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
+			 u64 notifier_id, void *data, size_t data_len)
 {
-	struct rbd_device *rbd_dev = (struct rbd_device *)data;
+	struct rbd_device *rbd_dev = arg;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
 	int ret;
 
-	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
-	     rbd_dev->header_name, (unsigned long long)notify_id,
-	     (unsigned int)opcode);
+	dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev,
+	     cookie, notify_id);
 
 	/*
 	 * Until adequate refresh error handling is in place, there is
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 	if (ret)
 		rbd_warn(rbd_dev, "refresh failed: %d", ret);
 
-	ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+				   &rbd_dev->header_oloc, notify_id, cookie,
+				   NULL, 0);
 	if (ret)
 		rbd_warn(rbd_dev, "notify_ack ret %d", ret);
 }
 
-/*
- * Send a (un)watch request and wait for the ack.  Return a request
- * with a ref held on success or error.
- */
-static struct rbd_obj_request *rbd_obj_watch_request_helper(
-						struct rbd_device *rbd_dev,
-						bool watch)
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
 {
-	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct ceph_options *opts = osdc->client->options;
-	struct rbd_obj_request *obj_request;
+	struct rbd_device *rbd_dev = arg;
 	int ret;
 
-	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
-					     OBJ_REQUEST_NODATA);
-	if (!obj_request)
-		return ERR_PTR(-ENOMEM);
-
-	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
-						  obj_request);
-	if (!obj_request->osd_req) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
-			      rbd_dev->watch_event->cookie, 0, watch);
-	rbd_osd_req_format_write(obj_request);
+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
 
-	if (watch)
-		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
-
-	ret = rbd_obj_request_submit(osdc, obj_request);
-	if (ret)
-		goto out;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
-	ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
-	if (ret)
-		goto out;
-
-	ret = obj_request->result;
+	ret = rbd_dev_header_watch_sync(rbd_dev);
 	if (ret) {
-		if (watch)
-			rbd_obj_request_end(obj_request);
-		goto out;
+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+		return;
 	}
 
-	return obj_request;
-
-out:
-	rbd_obj_request_put(obj_request);
-	return ERR_PTR(ret);
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
 }
 
 /*
@@ -3205,35 +3140,33 @@ out:
 static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
 {
 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
-	struct rbd_obj_request *obj_request;
-	int ret;
+	struct ceph_osd_linger_request *handle;
 
-	rbd_assert(!rbd_dev->watch_event);
-	rbd_assert(!rbd_dev->watch_request);
+	rbd_assert(!rbd_dev->watch_handle);
 
-	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
-				     &rbd_dev->watch_event);
-	if (ret < 0)
-		return ret;
+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
+				 &rbd_dev->header_oloc, rbd_watch_cb,
+				 rbd_watch_errcb, rbd_dev);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
 
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
-	if (IS_ERR(obj_request)) {
-		ceph_osdc_cancel_event(rbd_dev->watch_event);
-		rbd_dev->watch_event = NULL;
-		return PTR_ERR(obj_request);
-	}
+	rbd_dev->watch_handle = handle;
+	return 0;
+}
 
-	/*
-	 * A watch request is set to linger, so the underlying osd
-	 * request won't go away until we unregister it.  We retain
-	 * a pointer to the object request during that time (in
-	 * rbd_dev->watch_request), so we'll keep a reference to it.
-	 * We'll drop that reference after we've unregistered it in
-	 * rbd_dev_header_unwatch_sync().
-	 */
-	rbd_dev->watch_request = obj_request;
+static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
 
-	return 0;
+	if (!rbd_dev->watch_handle)
+		return;
+
+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+	if (ret)
+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+	rbd_dev->watch_handle = NULL;
 }
 
 /*
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
  */
 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
 {
-	struct rbd_obj_request *obj_request;
-
-	rbd_assert(rbd_dev->watch_event);
-	rbd_assert(rbd_dev->watch_request);
-
-	rbd_obj_request_end(rbd_dev->watch_request);
-	rbd_obj_request_put(rbd_dev->watch_request);
-	rbd_dev->watch_request = NULL;
-
-	obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
-	if (!IS_ERR(obj_request))
-		rbd_obj_request_put(obj_request);
-	else
-		rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
-			 PTR_ERR(obj_request));
-
-	ceph_osdc_cancel_event(rbd_dev->watch_event);
-	rbd_dev->watch_event = NULL;
+	__rbd_dev_header_unwatch_sync(rbd_dev);
 
 	dout("%s flushing notifies\n", __func__);
 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
 	if (!ondisk)
 		return -ENOMEM;
 
-	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name,
 				0, size, ondisk);
 	if (ret < 0)
 		goto out;
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev)
 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
 	bool need_put = !!rbd_dev->opts;
 
+	ceph_oid_destroy(&rbd_dev->header_oid);
+
 	rbd_put_client(rbd_dev->rbd_client);
 	rbd_spec_put(rbd_dev->spec);
 	kfree(rbd_dev->opts);
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	INIT_LIST_HEAD(&rbd_dev->node);
 	init_rwsem(&rbd_dev->header_rwsem);
 
+	ceph_oid_init(&rbd_dev->header_oid);
+	ceph_oloc_init(&rbd_dev->header_oloc);
+
 	rbd_dev->dev.bus = &rbd_bus_type;
 	rbd_dev->dev.type = &rbd_device_type;
 	rbd_dev->dev.parent = &rbd_root_dev;
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 		__le64 size;
 	} __attribute__ ((packed)) size_buf = { 0 };
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_size",
 				  &snapid, sizeof (snapid),
 				  &size_buf, sizeof (size_buf));
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_object_prefix", NULL, 0,
 				  reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	u64 unsup;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_features",
 				  &snapid, sizeof (snapid),
 				  &features_buf, sizeof (features_buf));
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	}
 
 	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_parent",
 				  &snapid, sizeof (snapid),
 				  reply_buf, size);
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
 	u64 stripe_count;
 	int ret;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_stripe_unit_count", NULL, 0,
 				  (char *)&striping_info_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
 	if (!reply_buf)
 		return -ENOMEM;
 
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_snapcontext", NULL, 0,
 				  reply_buf, size);
 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 		return ERR_PTR(-ENOMEM);
 
 	snapid = cpu_to_le64(snap_id);
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name,
 				  "rbd", "get_snapshot_name",
 				  &snapid, sizeof (snapid),
 				  reply_buf, size);
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 again:
 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
 	if (ret == -ENOENT && tries++ < 1) {
-		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
-					       &newest_epoch);
+		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
+					    &newest_epoch);
 		if (ret < 0)
 			return ret;
 
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
-			ceph_monc_request_next_osdmap(&rbdc->client->monc);
+			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
 						     newest_epoch,
 						     opts->mount_timeout);
@@ -5260,35 +5181,26 @@ err_out_unlock:
 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
 {
 	struct rbd_spec *spec = rbd_dev->spec;
-	size_t size;
+	int ret;
 
 	/* Record the header object name for this rbd image. */
 
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+	rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
 	if (rbd_dev->image_format == 1)
-		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+				       spec->image_name, RBD_SUFFIX);
 	else
-		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
-
-	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
-	if (!rbd_dev->header_name)
-		return -ENOMEM;
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+				       RBD_HEADER_PREFIX, spec->image_id);
 
-	if (rbd_dev->image_format == 1)
-		sprintf(rbd_dev->header_name, "%s%s",
-			spec->image_name, RBD_SUFFIX);
-	else
-		sprintf(rbd_dev->header_name, "%s%s",
-			RBD_HEADER_PREFIX, spec->image_id);
-	return 0;
+	return ret;
 }
 
 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
 {
 	rbd_dev_unprobe(rbd_dev);
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
 	rbd_dev->spec->image_id = NULL;
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 			pr_info("image %s/%s does not exist\n",
 				rbd_dev->spec->pool_name,
 				rbd_dev->spec->image_name);
-			goto out_header_name;
+			goto err_out_format;
 		}
 	}
 
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
 		goto err_out_probe;
 
 	dout("discovered format %u image, header name is %s\n",
-		rbd_dev->image_format, rbd_dev->header_name);
+		rbd_dev->image_format, rbd_dev->header_oid.name);
 	return 0;
 
 err_out_probe:
@@ -5381,9 +5293,6 @@ err_out_probe:
 err_out_watch:
 	if (!depth)
 		rbd_dev_header_unwatch_sync(rbd_dev);
-out_header_name:
-	kfree(rbd_dev->header_name);
-	rbd_dev->header_name = NULL;
 err_out_format:
 	rbd_dev->image_format = 0;
 	kfree(rbd_dev->spec->image_id);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 43098cd9602b..eeb71e5de27a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page)
 /*
  * Finish an async read(ahead) op.
  */
-static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
+static void finish_read(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_osd_data *osd_data;
-	int rc = req->r_result;
-	int bytes = le32_to_cpu(msg->hdr.data_len);
+	int rc = req->r_result <= 0 ? req->r_result : 0;
+	int bytes = req->r_result >= 0 ? req->r_result : 0;
 	int num_pages;
 	int i;
 
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	req->r_callback = finish_read;
 	req->r_inode = inode;
 
-	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
-
 	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
 	ret = ceph_osdc_start_request(osdc, req, false);
 	if (ret < 0)
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 				   truncate_seq, truncate_size,
 				   &inode->i_mtime, &page, 1);
 	if (err < 0) {
-		dout("writepage setting page/mapping error %d %p\n", err, page);
+		struct writeback_control tmp_wbc;
+		if (!wbc)
+			wbc = &tmp_wbc;
+		if (err == -ERESTARTSYS) {
+			/* killed by SIGKILL */
+			dout("writepage interrupted page %p\n", page);
+			redirty_page_for_writepage(wbc, page);
+			end_page_writeback(page);
+			goto out;
+		}
+		dout("writepage setting page/mapping error %d %p\n",
+		     err, page);
 		SetPageError(page);
 		mapping_set_error(&inode->i_data, err);
-		if (wbc)
-			wbc->pages_skipped++;
+		wbc->pages_skipped++;
 	} else {
 		dout("writepage cleaned page %p\n", page);
 		err = 0;  /* vfs expects us to return 0 */
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 	BUG_ON(!inode);
 	ihold(inode);
 	err = writepage_nounlock(page, wbc);
+	if (err == -ERESTARTSYS) {
+		/* direct memory reclaimer was killed by SIGKILL. return 0
+		 * to prevent caller from setting mapping/page error */
+		err = 0;
+	}
 	unlock_page(page);
 	iput(inode);
 	return err;
 }
 
-
 /*
  * lame release_pages helper.  release_pages() isn't exported to
  * modules.
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num)
  * If we get an error, set the mapping error bit, but not the individual
  * page error bits.
  */
-static void writepages_finish(struct ceph_osd_request *req,
-			      struct ceph_msg *msg)
+static void writepages_finish(struct ceph_osd_request *req)
 {
 	struct inode *inode = req->r_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req,
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	bool remove_page;
 
-
 	dout("writepages_finish %p rc %d\n", inode, rc);
 	if (rc < 0)
 		mapping_set_error(mapping, rc);
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req,
 			clear_bdi_congested(&fsc->backing_dev_info,
 					    BLK_RW_ASYNC);
 
+		if (rc < 0)
+			SetPageError(page);
+
 		ceph_put_snap_context(page_snap_context(page));
 		page->private = 0;
 		ClearPagePrivate(page);
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping,
 	       (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
 
 	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		pr_warn("writepage_start %p on forced umount\n", inode);
-		truncate_pagecache(inode, 0);
+		if (ci->i_wrbuffer_ref > 0) {
+			pr_warn_ratelimited(
+				"writepage_start %p %lld forced umount\n",
+				inode, ceph_ino(inode));
+		}
 		mapping_set_error(mapping, -EIO);
 		return -EIO; /* we're in a forced umount, don't write! */
 	}
@@ -1063,10 +1079,7 @@ new_request:
 			pages = NULL;
 		}
 
-		vino = ceph_vino(inode);
-		ceph_osdc_build_request(req, offset, snapc, vino.snap,
-					&inode->i_mtime);
-
+		req->r_mtime = inode->i_mtime;
 		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
 		BUG_ON(rc);
 		req = NULL;
@@ -1099,8 +1112,7 @@ release_pvec_pages:
 		mapping->writeback_index = index;
 
 out:
-	if (req)
-		ceph_osdc_put_request(req);
+	ceph_osdc_put_request(req);
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
 	return rc;
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file,
 			    struct page *page)
 {
 	struct inode *inode = file_inode(file);
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	loff_t page_off = pos & PAGE_MASK;
 	int pos_in_page = pos & ~PAGE_MASK;
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file,
 	int r;
 	struct ceph_snap_context *snapc, *oldest;
 
+	if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
+		dout(" page %p forced umount\n", page);
+		unlock_page(page);
+		return -EIO;
+	}
+
 retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
 	wait_on_page_writeback(page);
@@ -1165,7 +1184,7 @@ retry_locked:
 			snapc = ceph_get_snap_context(snapc);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
-			r = wait_event_interruptible(ci->i_cap_wq,
+			r = wait_event_killable(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
 			if (r == -ERESTARTSYS)
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = {
 	.direct_IO = ceph_direct_io,
 };
 
+static void ceph_block_sigs(sigset_t *oldset)
+{
+	sigset_t mask;
+	siginitsetinv(&mask, sigmask(SIGKILL));
+	sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void ceph_restore_sigs(sigset_t *oldset)
+{
+	sigprocmask(SIG_SETMASK, oldset, NULL);
+}
 
 /*
  * vm ops
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct page *pinned_page = NULL;
 	loff_t off = vmf->pgoff << PAGE_SHIFT;
 	int want, got, ret;
+	sigset_t oldset;
+
+	ceph_block_sigs(&oldset);
 
 	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
 	     inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_CACHE;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
-				    -1, &got, &pinned_page);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page);
+	if (ret < 0)
+		goto out_restore;
+
 	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
 	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ceph_put_cap_refs(ci, got);
 
 	if (ret != -EAGAIN)
-		return ret;
+		goto out_restore;
 
 	/* read inline data */
 	if (off >= PAGE_SIZE) {
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					 ~__GFP_FS));
 		if (!page) {
 			ret = VM_FAULT_OOM;
-			goto out;
+			goto out_inline;
 		}
 		ret1 = __ceph_do_getattr(inode, page,
 				 CEPH_STAT_CAP_INLINE_DATA, true);
 		if (ret1 < 0 || off >= i_size_read(inode)) {
 			unlock_page(page);
 			put_page(page);
-			ret = VM_FAULT_SIGBUS;
-			goto out;
+			if (ret1 < 0)
+				ret = ret1;
+			else
+				ret = VM_FAULT_SIGBUS;
+			goto out_inline;
 		}
 		if (ret1 < PAGE_SIZE)
 			zero_user_segment(page, ret1, PAGE_SIZE);
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		SetPageUptodate(page);
 		vmf->page = page;
 		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
+out_inline:
+		dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
+		     inode, off, (size_t)PAGE_SIZE, ret);
 	}
-out:
-	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-	     inode, off, (size_t)PAGE_SIZE, ret);
+out_restore:
+	ceph_restore_sigs(&oldset);
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
+
 	return ret;
 }
 
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
+	sigset_t oldset;
 
 	prealloc_cf = ceph_alloc_cap_flush();
 	if (!prealloc_cf)
-		return VM_FAULT_SIGBUS;
+		return VM_FAULT_OOM;
+
+	ceph_block_sigs(&oldset);
 
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0) {
-			ret = VM_FAULT_SIGBUS;
+		if (ret < 0)
 			goto out_free;
-		}
 	}
 
 	if (off + PAGE_SIZE <= size)
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
 	else
 		want = CEPH_CAP_FILE_BUFFER;
-	while (1) {
-		got = 0;
-		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
-				    &got, NULL);
-		if (ret == 0)
-			break;
-		if (ret != -ERESTARTSYS) {
-			WARN_ON(1);
-			ret = VM_FAULT_SIGBUS;
-			goto out_free;
-		}
-	}
+
+	got = 0;
+	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
+			    &got, NULL);
+	if (ret < 0)
+		goto out_free;
+
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
 	     inode, off, len, ceph_cap_string(got));
 
 	/* Update time before taking page lock */
 	file_update_time(vma->vm_file);
 
-	lock_page(page);
+	do {
+		lock_page(page);
 
-	ret = VM_FAULT_NOPAGE;
-	if ((off > size) ||
-	    (page->mapping != inode->i_mapping)) {
-		unlock_page(page);
-		goto out;
-	}
+		if ((off > size) || (page->mapping != inode->i_mapping)) {
+			unlock_page(page);
+			ret = VM_FAULT_NOPAGE;
+			break;
+		}
+
+		ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+		if (ret >= 0) {
+			/* success.  we'll keep the page locked. */
+			set_page_dirty(page);
+			ret = VM_FAULT_LOCKED;
+		}
+	} while (ret == -EAGAIN);
 
-	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-	if (ret >= 0) {
-		/* success.  we'll keep the page locked. */
-		set_page_dirty(page);
-		ret = VM_FAULT_LOCKED;
-	} else {
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else
-			ret = VM_FAULT_SIGBUS;
-	}
-out:
 	if (ret == VM_FAULT_LOCKED ||
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
 		int dirty;
@@ -1495,8 +1523,10 @@ out:
 		inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
 out_free:
+	ceph_restore_sigs(&oldset);
 	ceph_free_cap_flush(prealloc_cf);
-
+	if (ret < 0)
+		ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
 	return ret;
 }
 
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 		goto out_put;
 	}
 
-	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
+	req->r_mtime = inode->i_mtime;
 	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
 	if (!err)
 		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
-	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
-		 "%llx.00000000", ci->i_vino.ino);
-	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
+	ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
+
+	err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
 					 1, false, GFP_NOFS);
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 		goto out_unlock;
 	}
 
-	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
-			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
+	wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
-	wr_req->r_base_oloc.pool = pool;
-	wr_req->r_base_oid = rd_req->r_base_oid;
+	ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
+	ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
+
+	err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
+	if (err)
+		goto out_unlock;
 
 	/* one page should be large enough for STAT data */
 	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 
 	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
 				     0, false, true);
-	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
 	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
 
-	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
-				&ci->vfs_inode.i_mtime);
+	wr_req->r_mtime = ci->vfs_inode.i_mtime;
 	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
 
 	if (!err)
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 out_unlock:
 	up_write(&mdsc->pool_perm_rwsem);
 
-	if (rd_req)
-		ceph_osdc_put_request(rd_req);
-	if (wr_req)
-		ceph_osdc_put_request(wr_req);
+	ceph_osdc_put_request(rd_req);
+	ceph_osdc_put_request(wr_req);
 out:
 	if (!err)
 		err = have;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a351480dbabc..c052b5bf219b 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int
 	unlock_page(page);
 }
 
-static inline int cache_valid(struct ceph_inode_info *ci)
+static inline bool cache_valid(struct ceph_inode_info *ci)
 {
 	return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) &&
 		(ci->i_fscache_gen == ci->i_rdcache_gen));
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cfaeef18cbca..c17b5d76d75e 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1656,7 +1656,7 @@ retry_locked:
 	 */
 	if ((!is_delayed || mdsc->stopping) &&
 	    !S_ISDIR(inode->i_mode) &&		/* ignore readdir cache */
-	    ci->i_wrbuffer_ref == 0 &&		/* no dirty pages... */
+	    !(ci->i_wb_ref || ci->i_wrbuffer_ref) &&   /* no dirty pages... */
 	    inode->i_data.nrpages &&		/* have cached pages */
 	    (revoking & (CEPH_CAP_FILE_CACHE|
 			 CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
@@ -1698,8 +1698,8 @@ retry_locked:
 
 		revoking = cap->implemented & ~cap->issued;
 		dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
-		     cap->mds, cap, ceph_cap_string(cap->issued),
-		     ceph_cap_string(cap_used),
+		     cap->mds, cap, ceph_cap_string(cap_used),
+		     ceph_cap_string(cap->issued),
 		     ceph_cap_string(cap->implemented),
 		     ceph_cap_string(revoking));
 
@@ -2317,7 +2317,7 @@ again:
 
 	/* make sure file is actually open */
 	file_wanted = __ceph_caps_file_wanted(ci);
-	if ((file_wanted & need) == 0) {
+	if ((file_wanted & need) != need) {
 		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
 		     ceph_cap_string(need), ceph_cap_string(file_wanted));
 		*err = -EBADF;
@@ -2412,12 +2412,26 @@ again:
 		goto out_unlock;
 	}
 
-	if (!__ceph_is_any_caps(ci) &&
-	    ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
-		dout("get_cap_refs %p forced umount\n", inode);
-		*err = -EIO;
-		ret = 1;
-		goto out_unlock;
+	if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) {
+		int mds_wanted;
+		if (ACCESS_ONCE(mdsc->fsc->mount_state) ==
+		    CEPH_MOUNT_SHUTDOWN) {
+			dout("get_cap_refs %p forced umount\n", inode);
+			*err = -EIO;
+			ret = 1;
+			goto out_unlock;
+		}
+		mds_wanted = __ceph_caps_mds_wanted(ci);
+		if ((mds_wanted & need) != need) {
+			dout("get_cap_refs %p caps were dropped"
+			     " (session killed?)\n", inode);
+			*err = -ESTALE;
+			ret = 1;
+			goto out_unlock;
+		}
+		if ((mds_wanted & file_wanted) ==
+		    (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
+			ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED;
 	}
 
 	dout("get_cap_refs %p have %s needed %s\n", inode,
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			if (err == -EAGAIN)
 				continue;
 			if (err < 0)
-				return err;
+				ret = err;
 		} else {
 			ret = wait_event_interruptible(ci->i_cap_wq,
 			       try_get_cap_refs(ci, need, want, endoff,
@@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 				continue;
 			if (err < 0)
 				ret = err;
-			if (ret < 0)
-				return ret;
+		}
+		if (ret < 0) {
+			if (err == -ESTALE) {
+				/* session was killed, try renew caps */
+				ret = ceph_renew_caps(&ci->vfs_inode);
+				if (ret == 0)
+					continue;
+			}
+			return ret;
 		}
 
 		if (ci->i_inline_version != CEPH_INLINE_NONE &&
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
 	    ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
 	    (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
-	    !ci->i_wrbuffer_ref) {
+	    !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
 		if (try_nonblocking_invalidate(inode)) {
 			/* there were locked pages.. invalidate later
 			   in a separate thread. */
@@ -3226,6 +3247,8 @@ retry:
 
 	if (target < 0) {
 		__ceph_remove_cap(cap, false);
+		if (!ci->i_auth_cap)
+			ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
 		goto out_unlock;
 	}
 
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 31f831471ed2..39ff678e567f 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p)
 			   path ? path : "");
 		spin_unlock(&req->r_old_dentry->d_lock);
 		kfree(path);
-	} else if (req->r_path2) {
+	} else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) {
 		if (req->r_ino2.ino)
 			seq_printf(s, " #%llx/%s", req->r_ino2.ino,
 				   req->r_path2);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 3ab1192d2029..6e0fedf6713b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -70,16 +70,42 @@ out_unlock:
70} 70}
71 71
72/* 72/*
73 * for readdir, we encode the directory frag and offset within that 73 * f_pos encoding for readdir:
74 * frag into f_pos. 74 * - hash order:
75 * (0xff << 52) | ((24 bits hash) << 28) |
76 * (the nth entry with that hash, disambiguating collisions);
77 * - frag+name order:
78 * ((frag value) << 28) | (the nth entry in frag);
75 */ 79 */
80#define OFFSET_BITS 28
81#define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
82#define HASH_ORDER (0xffull << (OFFSET_BITS + 24))
83loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order)
84{
85 loff_t fpos = ((loff_t)high << OFFSET_BITS) | (loff_t)off;
86 if (hash_order)
87 fpos |= HASH_ORDER;
88 return fpos;
89}
90
91static bool is_hash_order(loff_t p)
92{
93 return (p & HASH_ORDER) == HASH_ORDER;
94}
95
76static unsigned fpos_frag(loff_t p) 96static unsigned fpos_frag(loff_t p)
77{ 97{
78 return p >> 32; 98 return p >> OFFSET_BITS;
79} 99}
100
101static unsigned fpos_hash(loff_t p)
102{
103 return ceph_frag_value(fpos_frag(p));
104}
105
80static unsigned fpos_off(loff_t p) 106static unsigned fpos_off(loff_t p)
81{ 107{
82 return p & 0xffffffff; 108 return p & OFFSET_MASK;
83} 109}
84 110
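
A minimal userspace sketch of the f_pos packing above, with the constants copied from this patch and invented sample values:

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define OFFSET_BITS 28
    #define OFFSET_MASK ((1 << OFFSET_BITS) - 1)
    #define HASH_ORDER  (0xffull << (OFFSET_BITS + 24))

    /* pack a (frag-or-hash, per-entry index) pair into an f_pos value */
    static int64_t make_fpos(unsigned high, unsigned off, bool hash_order)
    {
        int64_t fpos = ((int64_t)high << OFFSET_BITS) | (int64_t)off;
        if (hash_order)
            fpos |= HASH_ORDER;
        return fpos;
    }

    int main(void)
    {
        int64_t p = make_fpos(0x2aaaaa, 5, true); /* 24-bit hash, 6th entry */
        printf("hash_order=%d hash=0x%x off=%u\n",
               (p & HASH_ORDER) == HASH_ORDER,
               (unsigned)((p >> OFFSET_BITS) & 0xffffff),
               (unsigned)(p & OFFSET_MASK));
        return 0;
    }

The 0xff marker in bits 52-59 presumably cannot collide with a frag-order position: the top byte of a frag there is its split count, which the fragtree code below caps at 24.
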
85static int fpos_cmp(loff_t l, loff_t r) 111static int fpos_cmp(loff_t l, loff_t r)
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name,
111 return 0; 137 return 0;
112} 138}
113 139
140
141static struct dentry *
142__dcache_find_get_entry(struct dentry *parent, u64 idx,
143 struct ceph_readdir_cache_control *cache_ctl)
144{
145 struct inode *dir = d_inode(parent);
146 struct dentry *dentry;
147 unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1;
148 loff_t ptr_pos = idx * sizeof(struct dentry *);
149 pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT;
150
151 if (ptr_pos >= i_size_read(dir))
152 return NULL;
153
154 if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
155 ceph_readdir_cache_release(cache_ctl);
156 cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
157 if (!cache_ctl->page) {
158 dout(" page %lu not found\n", ptr_pgoff);
159 return ERR_PTR(-EAGAIN);
160 }
161 /* reading/filling the cache are serialized by
162 i_mutex, no need to use page lock */
163 unlock_page(cache_ctl->page);
164 cache_ctl->dentries = kmap(cache_ctl->page);
165 }
166
167 cache_ctl->index = idx & idx_mask;
168
169 rcu_read_lock();
170 spin_lock(&parent->d_lock);
171 /* check i_size again here, because an empty directory can be
172 * marked as complete while not holding the i_mutex. */
173 if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
174 dentry = cache_ctl->dentries[cache_ctl->index];
175 else
176 dentry = NULL;
177 spin_unlock(&parent->d_lock);
178 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
179 dentry = NULL;
180 rcu_read_unlock();
181 return dentry ? : ERR_PTR(-EAGAIN);
182}
183
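
The helper above treats the directory inode's page cache as one flat array of dentry pointers; a cache index maps to a page number and an in-page slot as sketched below (a standalone illustration assuming 4 KiB pages and 8-byte pointers):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12                /* assume 4 KiB pages */
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
        uint64_t ptr_size = sizeof(void *);            /* 8 on 64-bit */
        uint64_t per_page = PAGE_SIZE / ptr_size;      /* 512 slots/page */
        uint64_t idx = 1000;                           /* cache index */
        uint64_t page_no = (idx * ptr_size) >> PAGE_SHIFT;  /* -> page 1 */
        uint64_t slot = idx & (per_page - 1);          /* -> slot 488 */

        printf("idx %llu -> page %llu slot %llu\n",
               (unsigned long long)idx, (unsigned long long)page_no,
               (unsigned long long)slot);
        return 0;
    }
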
114/* 184/*
115 * When possible, we try to satisfy a readdir by peeking at the 185 * When possible, we try to satisfy a readdir by peeking at the
116 * dcache. We make this work by carefully ordering dentries on 186 * dcache. We make this work by carefully ordering dentries on
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
130 struct inode *dir = d_inode(parent); 200 struct inode *dir = d_inode(parent);
131 struct dentry *dentry, *last = NULL; 201 struct dentry *dentry, *last = NULL;
132 struct ceph_dentry_info *di; 202 struct ceph_dentry_info *di;
133 unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
134 int err = 0;
135 loff_t ptr_pos = 0;
136 struct ceph_readdir_cache_control cache_ctl = {}; 203 struct ceph_readdir_cache_control cache_ctl = {};
204 u64 idx = 0;
205 int err = 0;
137 206
138 dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); 207 dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos);
208
209 /* search start position */
210 if (ctx->pos > 2) {
211 u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *));
212 while (count > 0) {
213 u64 step = count >> 1;
214 dentry = __dcache_find_get_entry(parent, idx + step,
215 &cache_ctl);
216 if (!dentry) {
217 /* fall back to linear search */
218 idx = 0;
219 break;
220 }
221 if (IS_ERR(dentry)) {
222 err = PTR_ERR(dentry);
223 goto out;
224 }
225 di = ceph_dentry(dentry);
226 spin_lock(&dentry->d_lock);
227 if (fpos_cmp(di->offset, ctx->pos) < 0) {
228 idx += step + 1;
229 count -= step + 1;
230 } else {
231 count = step;
232 }
233 spin_unlock(&dentry->d_lock);
234 dput(dentry);
235 }
139 236
140 /* we can calculate cache index for the first dirfrag */ 237 dout("__dcache_readdir %p cache idx %llu\n", dir, idx);
141 if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
142 cache_ctl.index = fpos_off(ctx->pos) - 2;
143 BUG_ON(cache_ctl.index < 0);
144 ptr_pos = cache_ctl.index * sizeof(struct dentry *);
145 } 238 }
146 239
147 while (true) {
148 pgoff_t pgoff;
149 bool emit_dentry;
150 240
151 if (ptr_pos >= i_size_read(dir)) { 241 for (;;) {
242 bool emit_dentry = false;
243 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl);
244 if (!dentry) {
152 fi->flags |= CEPH_F_ATEND; 245 fi->flags |= CEPH_F_ATEND;
153 err = 0; 246 err = 0;
154 break; 247 break;
155 } 248 }
156 249 if (IS_ERR(dentry)) {
157 err = -EAGAIN; 250 err = PTR_ERR(dentry);
158 pgoff = ptr_pos >> PAGE_SHIFT; 251 goto out;
159 if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
160 ceph_readdir_cache_release(&cache_ctl);
161 cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
162 if (!cache_ctl.page) {
163 dout(" page %lu not found\n", pgoff);
164 break;
165 }
166 /* reading/filling the cache are serialized by
167 * i_mutex, no need to use page lock */
168 unlock_page(cache_ctl.page);
169 cache_ctl.dentries = kmap(cache_ctl.page);
170 } 252 }
171 253
172 rcu_read_lock();
173 spin_lock(&parent->d_lock);
174 /* check i_size again here, because empty directory can be
175 * marked as complete while not holding the i_mutex. */
176 if (ceph_dir_is_complete_ordered(dir) &&
177 ptr_pos < i_size_read(dir))
178 dentry = cache_ctl.dentries[cache_ctl.index % nsize];
179 else
180 dentry = NULL;
181 spin_unlock(&parent->d_lock);
182 if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
183 dentry = NULL;
184 rcu_read_unlock();
185 if (!dentry)
186 break;
187
188 emit_dentry = false;
189 di = ceph_dentry(dentry); 254 di = ceph_dentry(dentry);
190 spin_lock(&dentry->d_lock); 255 spin_lock(&dentry->d_lock);
191 if (di->lease_shared_gen == shared_gen && 256 if (di->lease_shared_gen == shared_gen &&
192 d_really_is_positive(dentry) && 257 d_really_is_positive(dentry) &&
193 ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
194 ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
195 fpos_cmp(ctx->pos, di->offset) <= 0) { 258 fpos_cmp(ctx->pos, di->offset) <= 0) {
196 emit_dentry = true; 259 emit_dentry = true;
197 } 260 }
198 spin_unlock(&dentry->d_lock); 261 spin_unlock(&dentry->d_lock);
199 262
200 if (emit_dentry) { 263 if (emit_dentry) {
201 dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, 264 dout(" %llx dentry %p %pd %p\n", di->offset,
202 dentry, dentry, d_inode(dentry)); 265 dentry, dentry, d_inode(dentry));
203 ctx->pos = di->offset; 266 ctx->pos = di->offset;
204 if (!dir_emit(ctx, dentry->d_name.name, 267 if (!dir_emit(ctx, dentry->d_name.name,
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
218 } else { 281 } else {
219 dput(dentry); 282 dput(dentry);
220 } 283 }
221
222 cache_ctl.index++;
223 ptr_pos += sizeof(struct dentry *);
224 } 284 }
285out:
225 ceph_readdir_cache_release(&cache_ctl); 286 ceph_readdir_cache_release(&cache_ctl);
226 if (last) { 287 if (last) {
227 int ret; 288 int ret;
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
235 return err; 296 return err;
236} 297}
237 298
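
The start-position scan above, and the matching scan over the reply chunk in ceph_readdir() further down, are both the classic lower-bound binary search: find the first element whose offset is not less than the target. A standalone sketch of the pattern (array contents arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    /* index of the first element >= target, or len if none */
    static size_t lower_bound(const int64_t *a, size_t len, int64_t target)
    {
        size_t i = 0, nr = len;
        while (nr > 0) {
            size_t step = nr >> 1;
            if (a[i + step] < target) {
                i += step + 1;
                nr -= step + 1;
            } else {
                nr = step;
            }
        }
        return i;
    }

    int main(void)
    {
        int64_t offsets[] = { 2, 5, 9, 14, 20 };
        printf("%zu %zu\n",
               lower_bound(offsets, 5, 9),    /* -> 2 (exact hit) */
               lower_bound(offsets, 5, 10));  /* -> 3 (next larger) */
        return 0;
    }
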
299static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos)
300{
301 if (!fi->last_readdir)
302 return true;
303 if (is_hash_order(pos))
304 return !ceph_frag_contains_value(fi->frag, fpos_hash(pos));
305 else
306 return fi->frag != fpos_frag(pos);
307}
308
238static int ceph_readdir(struct file *file, struct dir_context *ctx) 309static int ceph_readdir(struct file *file, struct dir_context *ctx)
239{ 310{
240 struct ceph_file_info *fi = file->private_data; 311 struct ceph_file_info *fi = file->private_data;
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
242 struct ceph_inode_info *ci = ceph_inode(inode); 313 struct ceph_inode_info *ci = ceph_inode(inode);
243 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 314 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
244 struct ceph_mds_client *mdsc = fsc->mdsc; 315 struct ceph_mds_client *mdsc = fsc->mdsc;
245 unsigned frag = fpos_frag(ctx->pos); 316 int i;
246 int off = fpos_off(ctx->pos);
247 int err; 317 int err;
248 u32 ftype; 318 u32 ftype;
249 struct ceph_mds_reply_info_parsed *rinfo; 319 struct ceph_mds_reply_info_parsed *rinfo;
250 320
251 dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); 321 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
252 if (fi->flags & CEPH_F_ATEND) 322 if (fi->flags & CEPH_F_ATEND)
253 return 0; 323 return 0;
254 324
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
260 inode->i_mode >> 12)) 330 inode->i_mode >> 12))
261 return 0; 331 return 0;
262 ctx->pos = 1; 332 ctx->pos = 1;
263 off = 1;
264 } 333 }
265 if (ctx->pos == 1) { 334 if (ctx->pos == 1) {
266 ino_t ino = parent_ino(file->f_path.dentry); 335 ino_t ino = parent_ino(file->f_path.dentry);
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
270 inode->i_mode >> 12)) 339 inode->i_mode >> 12))
271 return 0; 340 return 0;
272 ctx->pos = 2; 341 ctx->pos = 2;
273 off = 2;
274 } 342 }
275 343
276 /* can we use the dcache? */ 344 /* can we use the dcache? */
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
285 err = __dcache_readdir(file, ctx, shared_gen); 353 err = __dcache_readdir(file, ctx, shared_gen);
286 if (err != -EAGAIN) 354 if (err != -EAGAIN)
287 return err; 355 return err;
288 frag = fpos_frag(ctx->pos);
289 off = fpos_off(ctx->pos);
290 } else { 356 } else {
291 spin_unlock(&ci->i_ceph_lock); 357 spin_unlock(&ci->i_ceph_lock);
292 } 358 }
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
294 /* proceed with a normal readdir */ 360 /* proceed with a normal readdir */
295more: 361more:
296 /* do we have the correct frag content buffered? */ 362 /* do we have the correct frag content buffered? */
297 if (fi->frag != frag || fi->last_readdir == NULL) { 363 if (need_send_readdir(fi, ctx->pos)) {
298 struct ceph_mds_request *req; 364 struct ceph_mds_request *req;
365 unsigned frag;
299 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 366 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
300 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 367 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
301 368
@@ -305,6 +372,13 @@ more:
305 fi->last_readdir = NULL; 372 fi->last_readdir = NULL;
306 } 373 }
307 374
375 if (is_hash_order(ctx->pos)) {
376 frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
377 NULL, NULL);
378 } else {
379 frag = fpos_frag(ctx->pos);
380 }
381
308 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 382 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
309 ceph_vinop(inode), frag, fi->last_name); 383 ceph_vinop(inode), frag, fi->last_name);
310 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 384 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -331,6 +405,8 @@ more:
331 req->r_readdir_cache_idx = fi->readdir_cache_idx; 405 req->r_readdir_cache_idx = fi->readdir_cache_idx;
332 req->r_readdir_offset = fi->next_offset; 406 req->r_readdir_offset = fi->next_offset;
333 req->r_args.readdir.frag = cpu_to_le32(frag); 407 req->r_args.readdir.frag = cpu_to_le32(frag);
408 req->r_args.readdir.flags =
409 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS);
334 410
335 req->r_inode = inode; 411 req->r_inode = inode;
336 ihold(inode); 412 ihold(inode);
@@ -340,22 +416,26 @@ more:
340 ceph_mdsc_put_request(req); 416 ceph_mdsc_put_request(req);
341 return err; 417 return err;
342 } 418 }
343 dout("readdir got and parsed readdir result=%d" 419 dout("readdir got and parsed readdir result=%d on "
344 " on frag %x, end=%d, complete=%d\n", err, frag, 420 "frag %x, end=%d, complete=%d, hash_order=%d\n",
421 err, frag,
345 (int)req->r_reply_info.dir_end, 422 (int)req->r_reply_info.dir_end,
346 (int)req->r_reply_info.dir_complete); 423 (int)req->r_reply_info.dir_complete,
347 424 (int)req->r_reply_info.hash_order);
348 425
349 /* note next offset and last dentry name */
350 rinfo = &req->r_reply_info; 426 rinfo = &req->r_reply_info;
351 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 427 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
352 frag = le32_to_cpu(rinfo->dir_dir->frag); 428 frag = le32_to_cpu(rinfo->dir_dir->frag);
353 off = req->r_readdir_offset; 429 if (!rinfo->hash_order) {
354 fi->next_offset = off; 430 fi->next_offset = req->r_readdir_offset;
431 /* adjust ctx->pos to beginning of frag */
432 ctx->pos = ceph_make_fpos(frag,
433 fi->next_offset,
434 false);
435 }
355 } 436 }
356 437
357 fi->frag = frag; 438 fi->frag = frag;
358 fi->offset = fi->next_offset;
359 fi->last_readdir = req; 439 fi->last_readdir = req;
360 440
361 if (req->r_did_prepopulate) { 441 if (req->r_did_prepopulate) {
@@ -363,7 +443,8 @@ more:
363 if (fi->readdir_cache_idx < 0) { 443 if (fi->readdir_cache_idx < 0) {
364 /* preclude from marking dir ordered */ 444 /* preclude from marking dir ordered */
365 fi->dir_ordered_count = 0; 445 fi->dir_ordered_count = 0;
366 } else if (ceph_frag_is_leftmost(frag) && off == 2) { 446 } else if (ceph_frag_is_leftmost(frag) &&
447 fi->next_offset == 2) {
367 /* note dir version at start of readdir so 448 /* note dir version at start of readdir so
368 * we can tell if any dentries get dropped */ 449 * we can tell if any dentries get dropped */
369 fi->dir_release_count = req->r_dir_release_cnt; 450 fi->dir_release_count = req->r_dir_release_cnt;
@@ -377,65 +458,87 @@ more:
377 fi->dir_release_count = 0; 458 fi->dir_release_count = 0;
378 } 459 }
379 460
380 if (req->r_reply_info.dir_end) { 461 /* note next offset and last dentry name */
381 kfree(fi->last_name); 462 if (rinfo->dir_nr > 0) {
382 fi->last_name = NULL; 463 struct ceph_mds_reply_dir_entry *rde =
383 if (ceph_frag_is_rightmost(frag)) 464 rinfo->dir_entries + (rinfo->dir_nr-1);
384 fi->next_offset = 2; 465 unsigned next_offset = req->r_reply_info.dir_end ?
385 else 466 2 : (fpos_off(rde->offset) + 1);
386 fi->next_offset = 0; 467 err = note_last_dentry(fi, rde->name, rde->name_len,
387 } else { 468 next_offset);
388 err = note_last_dentry(fi,
389 rinfo->dir_dname[rinfo->dir_nr-1],
390 rinfo->dir_dname_len[rinfo->dir_nr-1],
391 fi->next_offset + rinfo->dir_nr);
392 if (err) 469 if (err)
393 return err; 470 return err;
471 } else if (req->r_reply_info.dir_end) {
472 fi->next_offset = 2;
473 /* keep last name */
394 } 474 }
395 } 475 }
396 476
397 rinfo = &fi->last_readdir->r_reply_info; 477 rinfo = &fi->last_readdir->r_reply_info;
398 dout("readdir frag %x num %d off %d chunkoff %d\n", frag, 478 dout("readdir frag %x num %d pos %llx chunk first %llx\n",
399 rinfo->dir_nr, off, fi->offset); 479 fi->frag, rinfo->dir_nr, ctx->pos,
400 480 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL);
401 ctx->pos = ceph_make_fpos(frag, off); 481
402 while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { 482 i = 0;
403 struct ceph_mds_reply_inode *in = 483 /* search start position */
404 rinfo->dir_in[off - fi->offset].in; 484 if (rinfo->dir_nr > 0) {
485 int step, nr = rinfo->dir_nr;
486 while (nr > 0) {
487 step = nr >> 1;
488 if (rinfo->dir_entries[i + step].offset < ctx->pos) {
489 i += step + 1;
490 nr -= step + 1;
491 } else {
492 nr = step;
493 }
494 }
495 }
496 for (; i < rinfo->dir_nr; i++) {
497 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
405 struct ceph_vino vino; 498 struct ceph_vino vino;
406 ino_t ino; 499 ino_t ino;
407 500
408 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 501 BUG_ON(rde->offset < ctx->pos);
409 off, off - fi->offset, rinfo->dir_nr, ctx->pos, 502
410 rinfo->dir_dname_len[off - fi->offset], 503 ctx->pos = rde->offset;
411 rinfo->dir_dname[off - fi->offset], in); 504 dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
412 BUG_ON(!in); 505 i, rinfo->dir_nr, ctx->pos,
413 ftype = le32_to_cpu(in->mode) >> 12; 506 rde->name_len, rde->name, &rde->inode.in);
414 vino.ino = le64_to_cpu(in->ino); 507
415 vino.snap = le64_to_cpu(in->snapid); 508 BUG_ON(!rde->inode.in);
509 ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
510 vino.ino = le64_to_cpu(rde->inode.in->ino);
511 vino.snap = le64_to_cpu(rde->inode.in->snapid);
416 ino = ceph_vino_to_ino(vino); 512 ino = ceph_vino_to_ino(vino);
417 if (!dir_emit(ctx, 513
418 rinfo->dir_dname[off - fi->offset], 514 if (!dir_emit(ctx, rde->name, rde->name_len,
419 rinfo->dir_dname_len[off - fi->offset], 515 ceph_translate_ino(inode->i_sb, ino), ftype)) {
420 ceph_translate_ino(inode->i_sb, ino), ftype)) {
421 dout("filldir stopping us...\n"); 516 dout("filldir stopping us...\n");
422 return 0; 517 return 0;
423 } 518 }
424 off++;
425 ctx->pos++; 519 ctx->pos++;
426 } 520 }
427 521
428 if (fi->last_name) { 522 if (fi->next_offset > 2) {
429 ceph_mdsc_put_request(fi->last_readdir); 523 ceph_mdsc_put_request(fi->last_readdir);
430 fi->last_readdir = NULL; 524 fi->last_readdir = NULL;
431 goto more; 525 goto more;
432 } 526 }
433 527
434 /* more frags? */ 528 /* more frags? */
435 if (!ceph_frag_is_rightmost(frag)) { 529 if (!ceph_frag_is_rightmost(fi->frag)) {
436 frag = ceph_frag_next(frag); 530 unsigned frag = ceph_frag_next(fi->frag);
437 off = 0; 531 if (is_hash_order(ctx->pos)) {
438 ctx->pos = ceph_make_fpos(frag, off); 532 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
533 fi->next_offset, true);
534 if (new_pos > ctx->pos)
535 ctx->pos = new_pos;
536 /* keep last_name */
537 } else {
538 ctx->pos = ceph_make_fpos(frag, fi->next_offset, false);
539 kfree(fi->last_name);
540 fi->last_name = NULL;
541 }
439 dout("readdir next frag is %x\n", frag); 542 dout("readdir next frag is %x\n", frag);
440 goto more; 543 goto more;
441 } 544 }
@@ -467,7 +570,7 @@ more:
467 return 0; 570 return 0;
468} 571}
469 572
470static void reset_readdir(struct ceph_file_info *fi, unsigned frag) 573static void reset_readdir(struct ceph_file_info *fi)
471{ 574{
472 if (fi->last_readdir) { 575 if (fi->last_readdir) {
473 ceph_mdsc_put_request(fi->last_readdir); 576 ceph_mdsc_put_request(fi->last_readdir);
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
477 fi->last_name = NULL; 580 fi->last_name = NULL;
478 fi->dir_release_count = 0; 581 fi->dir_release_count = 0;
479 fi->readdir_cache_idx = -1; 582 fi->readdir_cache_idx = -1;
480 if (ceph_frag_is_leftmost(frag)) 583 fi->next_offset = 2; /* compensate for . and .. */
481 fi->next_offset = 2; /* compensate for . and .. */
482 else
483 fi->next_offset = 0;
484 fi->flags &= ~CEPH_F_ATEND; 584 fi->flags &= ~CEPH_F_ATEND;
485} 585}
486 586
587/*
588 * discard buffered readdir content on seekdir(0), on a seek to a new
589 * frag, or on a seek prior to the current chunk
590 */
591static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
592{
593 struct ceph_mds_reply_info_parsed *rinfo;
594 loff_t chunk_offset;
595 if (new_pos == 0)
596 return true;
597 if (is_hash_order(new_pos)) {
598 /* no need to reset last_name for a forward seek when
599 * dentries are sorted in hash order */
600 } else if (fi->frag != fpos_frag(new_pos)) {
601 return true;
602 }
603 rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
604 if (!rinfo || !rinfo->dir_nr)
605 return true;
606 chunk_offset = rinfo->dir_entries[0].offset;
607 return new_pos < chunk_offset ||
608 is_hash_order(new_pos) != is_hash_order(chunk_offset);
609}
610
487static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) 611static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
488{ 612{
489 struct ceph_file_info *fi = file->private_data; 613 struct ceph_file_info *fi = file->private_data;
490 struct inode *inode = file->f_mapping->host; 614 struct inode *inode = file->f_mapping->host;
491 loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
492 loff_t retval; 615 loff_t retval;
493 616
494 inode_lock(inode); 617 inode_lock(inode);
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
505 } 628 }
506 629
507 if (offset >= 0) { 630 if (offset >= 0) {
631 if (need_reset_readdir(fi, offset)) {
632 dout("dir_llseek dropping %p content\n", file);
633 reset_readdir(fi);
634 } else if (is_hash_order(offset) && offset > file->f_pos) {
635 /* for hash offset, we don't know if a forward seek
636 * is within same frag */
637 fi->dir_release_count = 0;
638 fi->readdir_cache_idx = -1;
639 }
640
508 if (offset != file->f_pos) { 641 if (offset != file->f_pos) {
509 file->f_pos = offset; 642 file->f_pos = offset;
510 file->f_version = 0; 643 file->f_version = 0;
511 fi->flags &= ~CEPH_F_ATEND; 644 fi->flags &= ~CEPH_F_ATEND;
512 } 645 }
513 retval = offset; 646 retval = offset;
514
515 if (offset == 0 ||
516 fpos_frag(offset) != fi->frag ||
517 fpos_off(offset) < fi->offset) {
518 /* discard buffered readdir content on seekdir(0), or
519 * seek to new frag, or seek prior to current chunk */
520 dout("dir_llseek dropping %p content\n", file);
521 reset_readdir(fi, fpos_frag(offset));
522 } else if (fpos_cmp(offset, old_offset) > 0) {
523 /* reset dir_release_count if we did a forward seek */
524 fi->dir_release_count = 0;
525 fi->readdir_cache_idx = -1;
526 }
527 } 647 }
528out: 648out:
529 inode_unlock(inode); 649 inode_unlock(inode);
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
591 return dentry; 711 return dentry;
592} 712}
593 713
594static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) 714static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
595{ 715{
596 return ceph_ino(inode) == CEPH_INO_ROOT && 716 return ceph_ino(inode) == CEPH_INO_ROOT &&
597 strncmp(dentry->d_name.name, ".ceph", 5) == 0; 717 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4f1dc7120916..a888df6f2d71 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
192} 192}
193 193
194/* 194/*
195 * try to renew caps after the session gets killed.
196 */
197int ceph_renew_caps(struct inode *inode)
198{
199 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_mds_request *req;
202 int err, flags, wanted;
203
204 spin_lock(&ci->i_ceph_lock);
205 wanted = __ceph_caps_file_wanted(ci);
206 if (__ceph_is_any_real_caps(ci) &&
207 (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
208 int issued = __ceph_caps_issued(ci, NULL);
209 spin_unlock(&ci->i_ceph_lock);
210 dout("renew caps %p want %s issued %s updating mds_wanted\n",
211 inode, ceph_cap_string(wanted), ceph_cap_string(issued));
212 ceph_check_caps(ci, 0, NULL);
213 return 0;
214 }
215 spin_unlock(&ci->i_ceph_lock);
216
217 flags = 0;
218 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
219 flags = O_RDWR;
220 else if (wanted & CEPH_CAP_FILE_RD)
221 flags = O_RDONLY;
222 else if (wanted & CEPH_CAP_FILE_WR)
223 flags = O_WRONLY;
224#ifdef O_LAZY
225 if (wanted & CEPH_CAP_FILE_LAZYIO)
226 flags |= O_LAZY;
227#endif
228
229 req = prepare_open_request(inode->i_sb, flags, 0);
230 if (IS_ERR(req)) {
231 err = PTR_ERR(req);
232 goto out;
233 }
234
235 req->r_inode = inode;
236 ihold(inode);
237 req->r_num_caps = 1;
238 req->r_fmode = -1;
239
240 err = ceph_mdsc_do_request(mdsc, NULL, req);
241 ceph_mdsc_put_request(req);
242out:
243 dout("renew caps %p open result=%d\n", inode, err);
244 return err < 0 ? err : 0;
245}
246
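
The wanted-to-flags translation in ceph_renew_caps() reduces to a three-way mapping from cap bits to an open mode; a userspace sketch (CAP_RD/CAP_WR are placeholder values, not the kernel's CEPH_CAP_* encoding):

    #include <stdio.h>
    #include <fcntl.h>

    #define CAP_RD 0x1  /* placeholder for CEPH_CAP_FILE_RD */
    #define CAP_WR 0x2  /* placeholder for CEPH_CAP_FILE_WR */

    static int wanted_to_flags(int wanted)
    {
        if ((wanted & CAP_RD) && (wanted & CAP_WR))
            return O_RDWR;
        if (wanted & CAP_RD)
            return O_RDONLY;
        if (wanted & CAP_WR)
            return O_WRONLY;
        return 0;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               wanted_to_flags(CAP_RD),           /* O_RDONLY */
               wanted_to_flags(CAP_WR),           /* O_WRONLY */
               wanted_to_flags(CAP_RD | CAP_WR)); /* O_RDWR */
        return 0;
    }
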
247/*
195 * If we already have the requisite capabilities, we can satisfy 248 * If we already have the requisite capabilities, we can satisfy
196 * the open request locally (no need to request new caps from the 249 * the open request locally (no need to request new caps from the
197 * MDS). We do, however, need to inform the MDS (asynchronously) 250 * MDS). We do, however, need to inform the MDS (asynchronously)
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode,
616 kfree(aio_req); 669 kfree(aio_req);
617} 670}
618 671
619static void ceph_aio_complete_req(struct ceph_osd_request *req, 672static void ceph_aio_complete_req(struct ceph_osd_request *req)
620 struct ceph_msg *msg)
621{ 673{
622 int rc = req->r_result; 674 int rc = req->r_result;
623 struct inode *inode = req->r_inode; 675 struct inode *inode = req->r_inode;
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work)
714 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | 766 req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
715 CEPH_OSD_FLAG_ONDISK | 767 CEPH_OSD_FLAG_ONDISK |
716 CEPH_OSD_FLAG_WRITE; 768 CEPH_OSD_FLAG_WRITE;
717 req->r_base_oloc = orig_req->r_base_oloc; 769 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
718 req->r_base_oid = orig_req->r_base_oid; 770 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
771
772 ret = ceph_osdc_alloc_messages(req, GFP_NOFS);
773 if (ret) {
774 ceph_osdc_put_request(req);
775 req = orig_req;
776 goto out;
777 }
719 778
720 req->r_ops[0] = orig_req->r_ops[0]; 779 req->r_ops[0] = orig_req->r_ops[0];
721 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 780 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
722 781
723 ceph_osdc_build_request(req, req->r_ops[0].extent.offset, 782 req->r_mtime = aio_req->mtime;
724 snapc, CEPH_NOSNAP, &aio_req->mtime); 783 req->r_data_offset = req->r_ops[0].extent.offset;
725 784
726 ceph_osdc_put_request(orig_req); 785 ceph_osdc_put_request(orig_req);
727 786
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
733out: 792out:
734 if (ret < 0) { 793 if (ret < 0) {
735 req->r_result = ret; 794 req->r_result = ret;
736 ceph_aio_complete_req(req, NULL); 795 ceph_aio_complete_req(req);
737 } 796 }
738 797
739 ceph_put_snap_context(snapc); 798 ceph_put_snap_context(snapc);
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
764 list_add_tail(&req->r_unsafe_item, 823 list_add_tail(&req->r_unsafe_item,
765 &ci->i_unsafe_writes); 824 &ci->i_unsafe_writes);
766 spin_unlock(&ci->i_unsafe_lock); 825 spin_unlock(&ci->i_unsafe_lock);
826
827 complete_all(&req->r_completion);
767 } else { 828 } else {
768 spin_lock(&ci->i_unsafe_lock); 829 spin_lock(&ci->i_unsafe_lock);
769 list_del_init(&req->r_unsafe_item); 830 list_del_init(&req->r_unsafe_item);
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
875 (pos+len) | (PAGE_SIZE - 1)); 936 (pos+len) | (PAGE_SIZE - 1));
876 937
877 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); 938 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
939 req->r_mtime = mtime;
878 } 940 }
879 941
880
881 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, 942 osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
882 false, false); 943 false, false);
883 944
884 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
885
886 if (aio_req) { 945 if (aio_req) {
887 aio_req->total_len += len; 946 aio_req->total_len += len;
888 aio_req->num_reqs++; 947 aio_req->num_reqs++;
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
956 req, false); 1015 req, false);
957 if (ret < 0) { 1016 if (ret < 0) {
958 req->r_result = ret; 1017 req->r_result = ret;
959 ceph_aio_complete_req(req, NULL); 1018 ceph_aio_complete_req(req);
960 } 1019 }
961 } 1020 }
962 return -EIOCBQUEUED; 1021 return -EIOCBQUEUED;
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
1067 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 1126 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
1068 false, true); 1127 false, true);
1069 1128
1070 /* BUG_ON(vino.snap != CEPH_NOSNAP); */ 1129 req->r_mtime = mtime;
1071 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
1072
1073 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1130 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1074 if (!ret) 1131 if (!ret)
1075 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1132 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode,
1524 goto out; 1581 goto out;
1525 } 1582 }
1526 1583
1527 ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, 1584 req->r_mtime = inode->i_mtime;
1528 &inode->i_mtime);
1529
1530 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1585 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1531 if (!ret) { 1586 if (!ret) {
1532 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1587 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e669cfa9d793..f059b5997072 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -11,6 +11,7 @@
11#include <linux/xattr.h> 11#include <linux/xattr.h>
12#include <linux/posix_acl.h> 12#include <linux/posix_acl.h>
13#include <linux/random.h> 13#include <linux/random.h>
14#include <linux/sort.h>
14 15
15#include "super.h" 16#include "super.h"
16#include "mds_client.h" 17#include "mds_client.h"
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode,
254 diri_auth = ci->i_auth_cap->mds; 255 diri_auth = ci->i_auth_cap->mds;
255 spin_unlock(&ci->i_ceph_lock); 256 spin_unlock(&ci->i_ceph_lock);
256 257
258 if (mds == -1) /* CDIR_AUTH_PARENT */
259 mds = diri_auth;
260
257 mutex_lock(&ci->i_fragtree_mutex); 261 mutex_lock(&ci->i_fragtree_mutex);
258 if (ndist == 0 && mds == diri_auth) { 262 if (ndist == 0 && mds == diri_auth) {
259 /* no delegation info needed. */ 263 /* no delegation info needed. */
@@ -300,20 +304,38 @@ out:
300 return err; 304 return err;
301} 305}
302 306
307static int frag_tree_split_cmp(const void *l, const void *r)
308{
309 struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
310 struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
311 return ceph_frag_compare(ls->frag, rs->frag);
312}
313
314static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
315{
316 if (!frag)
317 return f == ceph_frag_make(0, 0);
318 if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
319 return false;
320 return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
321}
322
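
A frag_t packs the split depth into the top 8 bits and a left-aligned 24-bit value below it; a frag is a direct child of another iff it carries exactly split_by more bits and its value falls inside the parent's mask. A standalone sketch, with the helpers re-derived from include/linux/ceph/ceph_frag.h (simplified, illustrative only):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t frag_make(uint32_t b, uint32_t v)
    {
        return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
    }
    static uint32_t frag_bits(uint32_t f)  { return f >> 24; }
    static uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }
    static uint32_t frag_mask(uint32_t f)
    {
        return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
    }
    static int frag_contains(uint32_t f, uint32_t v)
    {
        return (v & frag_mask(f)) == frag_value(f);
    }

    /* analogue of is_frag_child() above */
    static int is_child(uint32_t f, uint32_t parent, uint32_t split_by)
    {
        if (frag_bits(f) != frag_bits(parent) + split_by)
            return 0;
        return frag_contains(parent, frag_value(f));
    }

    int main(void)
    {
        uint32_t root  = frag_make(0, 0);        /* "*"  */
        uint32_t left  = frag_make(1, 0x000000); /* "0*" */
        uint32_t right = frag_make(1, 0x800000); /* "1*" */
        printf("%d %d %d\n",
               is_child(left, root, 1),   /* 1 */
               is_child(right, root, 1),  /* 1 */
               is_child(right, left, 1)); /* 0 */
        return 0;
    }
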
303static int ceph_fill_fragtree(struct inode *inode, 323static int ceph_fill_fragtree(struct inode *inode,
304 struct ceph_frag_tree_head *fragtree, 324 struct ceph_frag_tree_head *fragtree,
305 struct ceph_mds_reply_dirfrag *dirinfo) 325 struct ceph_mds_reply_dirfrag *dirinfo)
306{ 326{
307 struct ceph_inode_info *ci = ceph_inode(inode); 327 struct ceph_inode_info *ci = ceph_inode(inode);
308 struct ceph_inode_frag *frag; 328 struct ceph_inode_frag *frag, *prev_frag = NULL;
309 struct rb_node *rb_node; 329 struct rb_node *rb_node;
310 int i; 330 unsigned i, split_by, nsplits;
311 u32 id, nsplits; 331 u32 id;
312 bool update = false; 332 bool update = false;
313 333
314 mutex_lock(&ci->i_fragtree_mutex); 334 mutex_lock(&ci->i_fragtree_mutex);
315 nsplits = le32_to_cpu(fragtree->nsplits); 335 nsplits = le32_to_cpu(fragtree->nsplits);
316 if (nsplits) { 336 if (nsplits != ci->i_fragtree_nsplits) {
337 update = true;
338 } else if (nsplits) {
317 i = prandom_u32() % nsplits; 339 i = prandom_u32() % nsplits;
318 id = le32_to_cpu(fragtree->splits[i].frag); 340 id = le32_to_cpu(fragtree->splits[i].frag);
319 if (!__ceph_find_frag(ci, id)) 341 if (!__ceph_find_frag(ci, id))
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode,
332 if (!update) 354 if (!update)
333 goto out_unlock; 355 goto out_unlock;
334 356
357 if (nsplits > 1) {
358 sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
359 frag_tree_split_cmp, NULL);
360 }
361
335 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 362 dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
336 rb_node = rb_first(&ci->i_fragtree); 363 rb_node = rb_first(&ci->i_fragtree);
337 for (i = 0; i < nsplits; i++) { 364 for (i = 0; i < nsplits; i++) {
338 id = le32_to_cpu(fragtree->splits[i].frag); 365 id = le32_to_cpu(fragtree->splits[i].frag);
366 split_by = le32_to_cpu(fragtree->splits[i].by);
367 if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
368 pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
369 "frag %x split by %d\n", ceph_vinop(inode),
370 i, nsplits, id, split_by);
371 continue;
372 }
339 frag = NULL; 373 frag = NULL;
340 while (rb_node) { 374 while (rb_node) {
341 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 375 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode,
347 break; 381 break;
348 } 382 }
349 rb_node = rb_next(rb_node); 383 rb_node = rb_next(rb_node);
350 rb_erase(&frag->node, &ci->i_fragtree); 384 /* delete stale split/leaf node */
351 kfree(frag); 385 if (frag->split_by > 0 ||
386 !is_frag_child(frag->frag, prev_frag)) {
387 rb_erase(&frag->node, &ci->i_fragtree);
388 if (frag->split_by > 0)
389 ci->i_fragtree_nsplits--;
390 kfree(frag);
391 }
352 frag = NULL; 392 frag = NULL;
353 } 393 }
354 if (!frag) { 394 if (!frag) {
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode,
356 if (IS_ERR(frag)) 396 if (IS_ERR(frag))
357 continue; 397 continue;
358 } 398 }
359 frag->split_by = le32_to_cpu(fragtree->splits[i].by); 399 if (frag->split_by == 0)
400 ci->i_fragtree_nsplits++;
401 frag->split_by = split_by;
360 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 402 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
403 prev_frag = frag;
361 } 404 }
362 while (rb_node) { 405 while (rb_node) {
363 frag = rb_entry(rb_node, struct ceph_inode_frag, node); 406 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
364 rb_node = rb_next(rb_node); 407 rb_node = rb_next(rb_node);
365 rb_erase(&frag->node, &ci->i_fragtree); 408 /* delete stale split/leaf node */
366 kfree(frag); 409 if (frag->split_by > 0 ||
410 !is_frag_child(frag->frag, prev_frag)) {
411 rb_erase(&frag->node, &ci->i_fragtree);
412 if (frag->split_by > 0)
413 ci->i_fragtree_nsplits--;
414 kfree(frag);
415 }
367 } 416 }
368out_unlock: 417out_unlock:
369 mutex_unlock(&ci->i_fragtree_mutex); 418 mutex_unlock(&ci->i_fragtree_mutex);
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode)
513 rb_erase(n, &ci->i_fragtree); 562 rb_erase(n, &ci->i_fragtree);
514 kfree(frag); 563 kfree(frag);
515 } 564 }
565 ci->i_fragtree_nsplits = 0;
516 566
517 __ceph_destroy_xattrs(ci); 567 __ceph_destroy_xattrs(ci);
518 if (ci->i_xattrs.blob) 568 if (ci->i_xattrs.blob)
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode)
533 return 1; 583 return 1;
534} 584}
535 585
586static inline blkcnt_t calc_inode_blocks(u64 size)
587{
588 return (size + (1<<9) - 1) >> 9;
589}
590
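
calc_inode_blocks() rounds a byte count up to 512-byte units, matching i_blocks' sector-sized accounting. A quick check:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t calc_inode_blocks(uint64_t size)
    {
        return (size + (1 << 9) - 1) >> 9;  /* round up to 512-byte units */
    }

    int main(void)
    {
        printf("%llu %llu %llu\n",
               (unsigned long long)calc_inode_blocks(0),    /* 0 */
               (unsigned long long)calc_inode_blocks(512),  /* 1 */
               (unsigned long long)calc_inode_blocks(513)); /* 2 */
        return 0;
    }
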
536/* 591/*
537 * Helpers to fill in size, ctime, mtime, and atime. We have to be 592 * Helpers to fill in size, ctime, mtime, and atime. We have to be
538 * careful because either the client or MDS may have more up to date 593 * careful because either the client or MDS may have more up to date
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
555 size = 0; 610 size = 0;
556 } 611 }
557 i_size_write(inode, size); 612 i_size_write(inode, size);
558 inode->i_blocks = (size + (1<<9) - 1) >> 9; 613 inode->i_blocks = calc_inode_blocks(size);
559 ci->i_reported_size = size; 614 ci->i_reported_size = size;
560 if (truncate_seq != ci->i_truncate_seq) { 615 if (truncate_seq != ci->i_truncate_seq) {
561 dout("truncate_seq %u -> %u\n", 616 dout("truncate_seq %u -> %u\n",
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
814 869
815 spin_unlock(&ci->i_ceph_lock); 870 spin_unlock(&ci->i_ceph_lock);
816 871
817 err = -EINVAL; 872 if (symlen != i_size_read(inode)) {
818 if (WARN_ON(symlen != i_size_read(inode))) 873 pr_err("fill_inode %llx.%llx BAD symlink "
819 goto out; 874 "size %lld\n", ceph_vinop(inode),
875 i_size_read(inode));
876 i_size_write(inode, symlen);
877 inode->i_blocks = calc_inode_blocks(symlen);
878 }
820 879
821 err = -ENOMEM; 880 err = -ENOMEM;
822 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); 881 sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1309 int i, err = 0; 1368 int i, err = 0;
1310 1369
1311 for (i = 0; i < rinfo->dir_nr; i++) { 1370 for (i = 0; i < rinfo->dir_nr; i++) {
1371 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1312 struct ceph_vino vino; 1372 struct ceph_vino vino;
1313 struct inode *in; 1373 struct inode *in;
1314 int rc; 1374 int rc;
1315 1375
1316 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1376 vino.ino = le64_to_cpu(rde->inode.in->ino);
1317 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1377 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1318 1378
1319 in = ceph_get_inode(req->r_dentry->d_sb, vino); 1379 in = ceph_get_inode(req->r_dentry->d_sb, vino);
1320 if (IS_ERR(in)) { 1380 if (IS_ERR(in)) {
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1322 dout("new_inode badness got %d\n", err); 1382 dout("new_inode badness got %d\n", err);
1323 continue; 1383 continue;
1324 } 1384 }
1325 rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1385 rc = fill_inode(in, NULL, &rde->inode, NULL, session,
1326 req->r_request_started, -1, 1386 req->r_request_started, -1,
1327 &req->r_caps_reservation); 1387 &req->r_caps_reservation);
1328 if (rc < 0) { 1388 if (rc < 0) {
1329 pr_err("fill_inode badness on %p got %d\n", in, rc); 1389 pr_err("fill_inode badness on %p got %d\n", in, rc);
1330 err = rc; 1390 err = rc;
1331 continue;
1332 } 1391 }
1392 iput(in);
1333 } 1393 }
1334 1394
1335 return err; 1395 return err;
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1387 struct ceph_mds_session *session) 1447 struct ceph_mds_session *session)
1388{ 1448{
1389 struct dentry *parent = req->r_dentry; 1449 struct dentry *parent = req->r_dentry;
1450 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1390 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1451 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1391 struct qstr dname; 1452 struct qstr dname;
1392 struct dentry *dn; 1453 struct dentry *dn;
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1394 int err = 0, skipped = 0, ret, i; 1455 int err = 0, skipped = 0, ret, i;
1395 struct inode *snapdir = NULL; 1456 struct inode *snapdir = NULL;
1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1457 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1397 struct ceph_dentry_info *di;
1398 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1458 u32 frag = le32_to_cpu(rhead->args.readdir.frag);
1459 u32 last_hash = 0;
1460 u32 fpos_offset;
1399 struct ceph_readdir_cache_control cache_ctl = {}; 1461 struct ceph_readdir_cache_control cache_ctl = {};
1400 1462
1401 if (req->r_aborted) 1463 if (req->r_aborted)
1402 return readdir_prepopulate_inodes_only(req, session); 1464 return readdir_prepopulate_inodes_only(req, session);
1403 1465
1466 if (rinfo->hash_order && req->r_path2) {
1467 last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1468 req->r_path2, strlen(req->r_path2));
1469 last_hash = ceph_frag_value(last_hash);
1470 }
1471
1404 if (rinfo->dir_dir && 1472 if (rinfo->dir_dir &&
1405 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1473 le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1406 dout("readdir_prepopulate got new frag %x -> %x\n", 1474 dout("readdir_prepopulate got new frag %x -> %x\n",
1407 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1475 frag, le32_to_cpu(rinfo->dir_dir->frag));
1408 frag = le32_to_cpu(rinfo->dir_dir->frag); 1476 frag = le32_to_cpu(rinfo->dir_dir->frag);
1409 if (ceph_frag_is_leftmost(frag)) 1477 if (!rinfo->hash_order)
1410 req->r_readdir_offset = 2; 1478 req->r_readdir_offset = 2;
1411 else
1412 req->r_readdir_offset = 0;
1413 } 1479 }
1414 1480
1415 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { 1481 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1427 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { 1493 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
1428 /* note dir version at start of readdir so we can tell 1494 /* note dir version at start of readdir so we can tell
1429 * if any dentries get dropped */ 1495 * if any dentries get dropped */
1430 struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
1431 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1496 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
1432 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1497 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
1433 req->r_readdir_cache_idx = 0; 1498 req->r_readdir_cache_idx = 0;
1434 } 1499 }
1435 1500
1436 cache_ctl.index = req->r_readdir_cache_idx; 1501 cache_ctl.index = req->r_readdir_cache_idx;
1502 fpos_offset = req->r_readdir_offset;
1437 1503
1438 /* FIXME: release caps/leases if error occurs */ 1504 /* FIXME: release caps/leases if error occurs */
1439 for (i = 0; i < rinfo->dir_nr; i++) { 1505 for (i = 0; i < rinfo->dir_nr; i++) {
1506 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
1440 struct ceph_vino vino; 1507 struct ceph_vino vino;
1441 1508
1442 dname.name = rinfo->dir_dname[i]; 1509 dname.name = rde->name;
1443 dname.len = rinfo->dir_dname_len[i]; 1510 dname.len = rde->name_len;
1444 dname.hash = full_name_hash(dname.name, dname.len); 1511 dname.hash = full_name_hash(dname.name, dname.len);
1445 1512
1446 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); 1513 vino.ino = le64_to_cpu(rde->inode.in->ino);
1447 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); 1514 vino.snap = le64_to_cpu(rde->inode.in->snapid);
1515
1516 if (rinfo->hash_order) {
1517 u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
1518 rde->name, rde->name_len);
1519 hash = ceph_frag_value(hash);
1520 if (hash != last_hash)
1521 fpos_offset = 2;
1522 last_hash = hash;
1523 rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
1524 } else {
1525 rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
1526 }
1448 1527
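
Under hash order the per-hash index restarts at 2 whenever the name hash changes (offsets 0 and 1 stay reserved for "." and ".."), so names that collide on the same hash get consecutive indices. A toy walk-through with invented hashes:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* invented (name, 24-bit hash) pairs, already in hash order */
        struct { const char *name; uint32_t hash; } ents[] = {
            { "a", 0x111111 },
            { "b", 0x222222 },
            { "c", 0x222222 },  /* collides with "b" */
            { "d", 0x333333 },
        };
        uint32_t last_hash = 0;
        unsigned off = 2;
        for (int i = 0; i < 4; i++) {
            if (ents[i].hash != last_hash)
                off = 2;        /* new hash: restart per-hash index */
            last_hash = ents[i].hash;
            printf("%s -> hash 0x%06x idx %u\n",
                   ents[i].name, (unsigned)ents[i].hash, off++);
        }
        return 0;
    }
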
1449retry_lookup: 1528retry_lookup:
1450 dn = d_lookup(parent, &dname); 1529 dn = d_lookup(parent, &dname);
@@ -1490,7 +1569,7 @@ retry_lookup:
1490 } 1569 }
1491 } 1570 }
1492 1571
1493 ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1572 ret = fill_inode(in, NULL, &rde->inode, NULL, session,
1494 req->r_request_started, -1, 1573 req->r_request_started, -1,
1495 &req->r_caps_reservation); 1574 &req->r_caps_reservation);
1496 if (ret < 0) { 1575 if (ret < 0) {
@@ -1523,11 +1602,9 @@ retry_lookup:
1523 dn = realdn; 1602 dn = realdn;
1524 } 1603 }
1525 1604
1526 di = dn->d_fsdata; 1605 ceph_dentry(dn)->offset = rde->offset;
1527 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1528 1606
1529 update_dentry_lease(dn, rinfo->dir_dlease[i], 1607 update_dentry_lease(dn, rde->lease, req->r_session,
1530 req->r_session,
1531 req->r_request_started); 1608 req->r_request_started);
1532 1609
1533 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1610 if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
1562 spin_lock(&ci->i_ceph_lock); 1639 spin_lock(&ci->i_ceph_lock);
1563 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1640 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1564 i_size_write(inode, size); 1641 i_size_write(inode, size);
1565 inode->i_blocks = (size + (1 << 9) - 1) >> 9; 1642 inode->i_blocks = calc_inode_blocks(size);
1566 1643
1567 /* tell the MDS if we are approaching max_size */ 1644 /* tell the MDS if we are approaching max_size */
1568 if ((size << 1) >= ci->i_max_size && 1645 if ((size << 1) >= ci->i_max_size &&
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work)
1624 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, 1701 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1625 i_pg_inv_work); 1702 i_pg_inv_work);
1626 struct inode *inode = &ci->vfs_inode; 1703 struct inode *inode = &ci->vfs_inode;
1704 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1627 u32 orig_gen; 1705 u32 orig_gen;
1628 int check = 0; 1706 int check = 0;
1629 1707
1630 mutex_lock(&ci->i_truncate_mutex); 1708 mutex_lock(&ci->i_truncate_mutex);
1709
1710 if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1711 pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n",
1712 inode, ceph_ino(inode));
1713 mapping_set_error(inode->i_mapping, -EIO);
1714 truncate_pagecache(inode, 0);
1715 mutex_unlock(&ci->i_truncate_mutex);
1716 goto out;
1717 }
1718
1631 spin_lock(&ci->i_ceph_lock); 1719 spin_lock(&ci->i_ceph_lock);
1632 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1720 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1633 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1721 ci->i_rdcache_gen, ci->i_rdcache_revoking);
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work)
1641 orig_gen = ci->i_rdcache_gen; 1729 orig_gen = ci->i_rdcache_gen;
1642 spin_unlock(&ci->i_ceph_lock); 1730 spin_unlock(&ci->i_ceph_lock);
1643 1731
1644 truncate_pagecache(inode, 0); 1732 if (invalidate_inode_pages2(inode->i_mapping) < 0) {
1733 pr_err("invalidate_pages %p fails\n", inode);
1734 }
1645 1735
1646 spin_lock(&ci->i_ceph_lock); 1736 spin_lock(&ci->i_ceph_lock);
1647 if (orig_gen == ci->i_rdcache_gen && 1737 if (orig_gen == ci->i_rdcache_gen &&
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
1920 if ((issued & CEPH_CAP_FILE_EXCL) && 2010 if ((issued & CEPH_CAP_FILE_EXCL) &&
1921 attr->ia_size > inode->i_size) { 2011 attr->ia_size > inode->i_size) {
1922 i_size_write(inode, attr->ia_size); 2012 i_size_write(inode, attr->ia_size);
1923 inode->i_blocks = 2013 inode->i_blocks = calc_inode_blocks(attr->ia_size);
1924 (attr->ia_size + (1 << 9) - 1) >> 9;
1925 inode->i_ctime = attr->ia_ctime; 2014 inode->i_ctime = attr->ia_ctime;
1926 ci->i_reported_size = attr->ia_size; 2015 ci->i_reported_size = attr->ia_size;
1927 dirtied |= CEPH_CAP_FILE_EXCL; 2016 dirtied |= CEPH_CAP_FILE_EXCL;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d70158..be6b1657b1af 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
193 if (copy_from_user(&dl, arg, sizeof(dl))) 193 if (copy_from_user(&dl, arg, sizeof(dl)))
194 return -EFAULT; 194 return -EFAULT;
195 195
196 down_read(&osdc->map_sem); 196 down_read(&osdc->lock);
197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 197 r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
198 &dl.object_no, &dl.object_offset, 198 &dl.object_no, &dl.object_offset,
199 &olen); 199 &olen);
200 if (r < 0) { 200 if (r < 0) {
201 up_read(&osdc->map_sem); 201 up_read(&osdc->lock);
202 return -EIO; 202 return -EIO;
203 } 203 }
204 dl.file_offset -= dl.object_offset; 204 dl.file_offset -= dl.object_offset;
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
213 ceph_ino(inode), dl.object_no); 213 ceph_ino(inode), dl.object_no);
214 214
215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); 215 oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
216 ceph_oid_set_name(&oid, dl.object_name); 216 ceph_oid_printf(&oid, "%s", dl.object_name);
217 217
218 r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); 218 r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
219 if (r < 0) { 219 if (r < 0) {
220 up_read(&osdc->map_sem); 220 up_read(&osdc->lock);
221 return r; 221 return r;
222 } 222 }
223 223
224 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); 224 dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid);
225 if (dl.osd >= 0) { 225 if (dl.osd >= 0) {
226 struct ceph_entity_addr *a = 226 struct ceph_entity_addr *a =
227 ceph_osd_addr(osdc->osdmap, dl.osd); 227 ceph_osd_addr(osdc->osdmap, dl.osd);
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
230 } else { 230 } else {
231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); 231 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
232 } 232 }
233 up_read(&osdc->map_sem); 233 up_read(&osdc->lock);
234 234
235 /* send result back to user */ 235 /* send result back to user */
236 if (copy_to_user(arg, &dl, sizeof(dl))) 236 if (copy_to_user(arg, &dl, sizeof(dl)))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 85b8517f17a0..2103b823bec0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end,
181 181
182 ceph_decode_need(p, end, sizeof(num) + 2, bad); 182 ceph_decode_need(p, end, sizeof(num) + 2, bad);
183 num = ceph_decode_32(p); 183 num = ceph_decode_32(p);
184 info->dir_end = ceph_decode_8(p); 184 {
185 info->dir_complete = ceph_decode_8(p); 185 u16 flags = ceph_decode_16(p);
186 info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
187 info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
188 info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
189 }
186 if (num == 0) 190 if (num == 0)
187 goto done; 191 goto done;
188 192
189 BUG_ON(!info->dir_in); 193 BUG_ON(!info->dir_entries);
190 info->dir_dname = (void *)(info->dir_in + num); 194 if ((unsigned long)(info->dir_entries + num) >
191 info->dir_dname_len = (void *)(info->dir_dname + num); 195 (unsigned long)info->dir_entries + info->dir_buf_size) {
192 info->dir_dlease = (void *)(info->dir_dname_len + num);
193 if ((unsigned long)(info->dir_dlease + num) >
194 (unsigned long)info->dir_in + info->dir_buf_size) {
195 pr_err("dir contents are larger than expected\n"); 196 pr_err("dir contents are larger than expected\n");
196 WARN_ON(1); 197 WARN_ON(1);
197 goto bad; 198 goto bad;
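
The old reply format carried two consecutive u8 fields, dir_end then dir_complete; decoding the same two bytes as one little-endian u16 keeps those at bit 0 and bit 8 and frees the remaining bits for new flags such as hash_order. A decoding sketch (the flag values assume the CEPH_READDIR_* definitions in ceph_fs.h: bits 0, 8 and 9):

    #include <stdio.h>
    #include <stdint.h>

    #define READDIR_FRAG_END      (1 << 0) /* old u8 #1, bit 0 */
    #define READDIR_FRAG_COMPLETE (1 << 8) /* old u8 #2, bit 0 -> bit 8 */
    #define READDIR_HASH_ORDER    (1 << 9) /* new flag */

    int main(void)
    {
        uint16_t flags = READDIR_FRAG_END | READDIR_HASH_ORDER;

        printf("end=%d complete=%d hash_order=%d\n",
               !!(flags & READDIR_FRAG_END),
               !!(flags & READDIR_FRAG_COMPLETE),
               !!(flags & READDIR_HASH_ORDER));
        return 0;
    }
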
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end,
199 200
200 info->dir_nr = num; 201 info->dir_nr = num;
201 while (num) { 202 while (num) {
203 struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
202 /* dentry */ 204 /* dentry */
203 ceph_decode_need(p, end, sizeof(u32)*2, bad); 205 ceph_decode_need(p, end, sizeof(u32)*2, bad);
204 info->dir_dname_len[i] = ceph_decode_32(p); 206 rde->name_len = ceph_decode_32(p);
205 ceph_decode_need(p, end, info->dir_dname_len[i], bad); 207 ceph_decode_need(p, end, rde->name_len, bad);
206 info->dir_dname[i] = *p; 208 rde->name = *p;
207 *p += info->dir_dname_len[i]; 209 *p += rde->name_len;
208 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], 210 dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
209 info->dir_dname[i]); 211 rde->lease = *p;
210 info->dir_dlease[i] = *p;
211 *p += sizeof(struct ceph_mds_reply_lease); 212 *p += sizeof(struct ceph_mds_reply_lease);
212 213
213 /* inode */ 214 /* inode */
214 err = parse_reply_info_in(p, end, &info->dir_in[i], features); 215 err = parse_reply_info_in(p, end, &rde->inode, features);
215 if (err < 0) 216 if (err < 0)
216 goto out_bad; 217 goto out_bad;
218 /* ceph_readdir_prepopulate() will update it */
219 rde->offset = 0;
217 i++; 220 i++;
218 num--; 221 num--;
219 } 222 }
@@ -345,9 +348,9 @@ out_bad:
345 348
346static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) 349static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
347{ 350{
348 if (!info->dir_in) 351 if (!info->dir_entries)
349 return; 352 return;
350 free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); 353 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
351} 354}
352 355
353 356
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref)
567 kfree(req); 570 kfree(req);
568} 571}
569 572
573DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
574
570/* 575/*
571 * lookup session, bump ref if found. 576 * lookup session, bump ref if found.
572 * 577 *
573 * called under mdsc->mutex. 578 * called under mdsc->mutex.
574 */ 579 */
575static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, 580static struct ceph_mds_request *
576 u64 tid) 581lookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
577{ 582{
578 struct ceph_mds_request *req; 583 struct ceph_mds_request *req;
579 struct rb_node *n = mdsc->request_tree.rb_node;
580
581 while (n) {
582 req = rb_entry(n, struct ceph_mds_request, r_node);
583 if (tid < req->r_tid)
584 n = n->rb_left;
585 else if (tid > req->r_tid)
586 n = n->rb_right;
587 else {
588 ceph_mdsc_get_request(req);
589 return req;
590 }
591 }
592 return NULL;
593}
594 584
595static void __insert_request(struct ceph_mds_client *mdsc, 585 req = lookup_request(&mdsc->request_tree, tid);
596 struct ceph_mds_request *new) 586 if (req)
597{ 587 ceph_mdsc_get_request(req);
598 struct rb_node **p = &mdsc->request_tree.rb_node;
599 struct rb_node *parent = NULL;
600 struct ceph_mds_request *req = NULL;
601 588
602 while (*p) { 589 return req;
603 parent = *p;
604 req = rb_entry(parent, struct ceph_mds_request, r_node);
605 if (new->r_tid < req->r_tid)
606 p = &(*p)->rb_left;
607 else if (new->r_tid > req->r_tid)
608 p = &(*p)->rb_right;
609 else
610 BUG();
611 }
612
613 rb_link_node(&new->r_node, parent, p);
614 rb_insert_color(&new->r_node, &mdsc->request_tree);
615} 590}
616 591
617/* 592/*
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
630 req->r_num_caps); 605 req->r_num_caps);
631 dout("__register_request %p tid %lld\n", req, req->r_tid); 606 dout("__register_request %p tid %lld\n", req, req->r_tid);
632 ceph_mdsc_get_request(req); 607 ceph_mdsc_get_request(req);
633 __insert_request(mdsc, req); 608 insert_request(&mdsc->request_tree, req);
634 609
635 req->r_uid = current_fsuid(); 610 req->r_uid = current_fsuid();
636 req->r_gid = current_fsgid(); 611 req->r_gid = current_fsgid();
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
663 } 638 }
664 } 639 }
665 640
666 rb_erase(&req->r_node, &mdsc->request_tree); 641 erase_request(&mdsc->request_tree, req);
667 RB_CLEAR_NODE(&req->r_node);
668 642
669 if (req->r_unsafe_dir && req->r_got_unsafe) { 643 if (req->r_unsafe_dir && req->r_got_unsafe) {
670 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); 644 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
868 int metadata_bytes = 0; 842 int metadata_bytes = 0;
869 int metadata_key_count = 0; 843 int metadata_key_count = 0;
870 struct ceph_options *opt = mdsc->fsc->client->options; 844 struct ceph_options *opt = mdsc->fsc->client->options;
845 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
871 void *p; 846 void *p;
872 847
873 const char* metadata[][2] = { 848 const char* metadata[][2] = {
874 {"hostname", utsname()->nodename}, 849 {"hostname", utsname()->nodename},
875 {"kernel_version", utsname()->release}, 850 {"kernel_version", utsname()->release},
876 {"entity_id", opt->name ? opt->name : ""}, 851 {"entity_id", opt->name ? : ""},
852 {"root", fsopt->server_path ? : "/"},
877 {NULL, NULL} 853 {NULL, NULL}
878 }; 854 };
879 855
@@ -1149,9 +1125,11 @@ out:
1149static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 1125static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1150 void *arg) 1126 void *arg)
1151{ 1127{
1128 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
1152 struct ceph_inode_info *ci = ceph_inode(inode); 1129 struct ceph_inode_info *ci = ceph_inode(inode);
1153 LIST_HEAD(to_remove); 1130 LIST_HEAD(to_remove);
1154 int drop = 0; 1131 bool drop = false;
1132 bool invalidate = false;
1155 1133
1156 dout("removing cap %p, ci is %p, inode is %p\n", 1134 dout("removing cap %p, ci is %p, inode is %p\n",
1157 cap, ci, &ci->vfs_inode); 1135 cap, ci, &ci->vfs_inode);
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1159 __ceph_remove_cap(cap, false); 1137 __ceph_remove_cap(cap, false);
1160 if (!ci->i_auth_cap) { 1138 if (!ci->i_auth_cap) {
1161 struct ceph_cap_flush *cf; 1139 struct ceph_cap_flush *cf;
1162 struct ceph_mds_client *mdsc = 1140 struct ceph_mds_client *mdsc = fsc->mdsc;
1163 ceph_sb_to_client(inode->i_sb)->mdsc; 1141
1142 ci->i_ceph_flags |= CEPH_I_CAP_DROPPED;
1143
1144 if (ci->i_wrbuffer_ref > 0 &&
1145 ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
1146 invalidate = true;
1164 1147
1165 while (true) { 1148 while (true) {
1166 struct rb_node *n = rb_first(&ci->i_cap_flush_tree); 1149 struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1183 inode, ceph_ino(inode)); 1166 inode, ceph_ino(inode));
1184 ci->i_dirty_caps = 0; 1167 ci->i_dirty_caps = 0;
1185 list_del_init(&ci->i_dirty_item); 1168 list_del_init(&ci->i_dirty_item);
1186 drop = 1; 1169 drop = true;
1187 } 1170 }
1188 if (!list_empty(&ci->i_flushing_item)) { 1171 if (!list_empty(&ci->i_flushing_item)) {
1189 pr_warn_ratelimited( 1172 pr_warn_ratelimited(
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1193 ci->i_flushing_caps = 0; 1176 ci->i_flushing_caps = 0;
1194 list_del_init(&ci->i_flushing_item); 1177 list_del_init(&ci->i_flushing_item);
1195 mdsc->num_cap_flushing--; 1178 mdsc->num_cap_flushing--;
1196 drop = 1; 1179 drop = true;
1197 } 1180 }
1198 spin_unlock(&mdsc->cap_dirty_lock); 1181 spin_unlock(&mdsc->cap_dirty_lock);
1199 1182
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1210 list_del(&cf->list); 1193 list_del(&cf->list);
1211 ceph_free_cap_flush(cf); 1194 ceph_free_cap_flush(cf);
1212 } 1195 }
1213 while (drop--) 1196
1197 wake_up_all(&ci->i_cap_wq);
1198 if (invalidate)
1199 ceph_queue_invalidate(inode);
1200 if (drop)
1214 iput(inode); 1201 iput(inode);
1215 return 0; 1202 return 0;
1216} 1203}
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1220 */ 1207 */
1221static void remove_session_caps(struct ceph_mds_session *session) 1208static void remove_session_caps(struct ceph_mds_session *session)
1222{ 1209{
1210 struct ceph_fs_client *fsc = session->s_mdsc->fsc;
1211 struct super_block *sb = fsc->sb;
1223 dout("remove_session_caps on %p\n", session); 1212 dout("remove_session_caps on %p\n", session);
1224 iterate_session_caps(session, remove_session_caps_cb, NULL); 1213 iterate_session_caps(session, remove_session_caps_cb, fsc);
1225 1214
1226 spin_lock(&session->s_cap_lock); 1215 spin_lock(&session->s_cap_lock);
1227 if (session->s_nr_caps > 0) { 1216 if (session->s_nr_caps > 0) {
1228 struct super_block *sb = session->s_mdsc->fsc->sb;
1229 struct inode *inode; 1217 struct inode *inode;
1230 struct ceph_cap *cap, *prev = NULL; 1218 struct ceph_cap *cap, *prev = NULL;
1231 struct ceph_vino vino; 1219 struct ceph_vino vino;
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
1270{ 1258{
1271 struct ceph_inode_info *ci = ceph_inode(inode); 1259 struct ceph_inode_info *ci = ceph_inode(inode);
1272 1260
1273 wake_up_all(&ci->i_cap_wq);
1274 if (arg) { 1261 if (arg) {
1275 spin_lock(&ci->i_ceph_lock); 1262 spin_lock(&ci->i_ceph_lock);
1276 ci->i_wanted_max_size = 0; 1263 ci->i_wanted_max_size = 0;
1277 ci->i_requested_max_size = 0; 1264 ci->i_requested_max_size = 0;
1278 spin_unlock(&ci->i_ceph_lock); 1265 spin_unlock(&ci->i_ceph_lock);
1279 } 1266 }
1267 wake_up_all(&ci->i_cap_wq);
1280 return 0; 1268 return 0;
1281} 1269}
1282 1270
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1671 struct ceph_inode_info *ci = ceph_inode(dir); 1659 struct ceph_inode_info *ci = ceph_inode(dir);
1672 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1660 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1673 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; 1661 struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
1674 size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + 1662 size_t size = sizeof(struct ceph_mds_reply_dir_entry);
1675 sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
1676 int order, num_entries; 1663 int order, num_entries;
1677 1664
1678 spin_lock(&ci->i_ceph_lock); 1665 spin_lock(&ci->i_ceph_lock);
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
1683 1670
1684 order = get_order(size * num_entries); 1671 order = get_order(size * num_entries);
1685 while (order >= 0) { 1672 while (order >= 0) {
1686 rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | 1673 rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
1687 __GFP_NOWARN, 1674 __GFP_NOWARN,
1688 order); 1675 order);
1689 if (rinfo->dir_in) 1676 if (rinfo->dir_entries)
1690 break; 1677 break;
1691 order--; 1678 order--;
1692 } 1679 }
1693 if (!rinfo->dir_in) 1680 if (!rinfo->dir_entries)
1694 return -ENOMEM; 1681 return -ENOMEM;
1695 1682
1696 num_entries = (PAGE_SIZE << order) / size; 1683 num_entries = (PAGE_SIZE << order) / size;
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1722 INIT_LIST_HEAD(&req->r_unsafe_target_item); 1709 INIT_LIST_HEAD(&req->r_unsafe_target_item);
1723 req->r_fmode = -1; 1710 req->r_fmode = -1;
1724 kref_init(&req->r_kref); 1711 kref_init(&req->r_kref);
1712 RB_CLEAR_NODE(&req->r_node);
1725 INIT_LIST_HEAD(&req->r_wait); 1713 INIT_LIST_HEAD(&req->r_wait);
1726 init_completion(&req->r_completion); 1714 init_completion(&req->r_completion);
1727 init_completion(&req->r_safe_completion); 1715 init_completion(&req->r_safe_completion);
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2414 /* get request, session */ 2402 /* get request, session */
2415 tid = le64_to_cpu(msg->hdr.tid); 2403 tid = le64_to_cpu(msg->hdr.tid);
2416 mutex_lock(&mdsc->mutex); 2404 mutex_lock(&mdsc->mutex);
2417 req = __lookup_request(mdsc, tid); 2405 req = lookup_get_request(mdsc, tid);
2418 if (!req) { 2406 if (!req) {
2419 dout("handle_reply on unknown tid %llu\n", tid); 2407 dout("handle_reply on unknown tid %llu\n", tid);
2420 mutex_unlock(&mdsc->mutex); 2408 mutex_unlock(&mdsc->mutex);
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2604 fwd_seq = ceph_decode_32(&p); 2592 fwd_seq = ceph_decode_32(&p);
2605 2593
2606 mutex_lock(&mdsc->mutex); 2594 mutex_lock(&mdsc->mutex);
2607 req = __lookup_request(mdsc, tid); 2595 req = lookup_get_request(mdsc, tid);
2608 if (!req) { 2596 if (!req) {
2609 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); 2597 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2610 goto out; /* dup reply? */ 2598 goto out; /* dup reply? */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ee69a537dba5..e7d38aac7109 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in {
47 u32 pool_ns_len; 47 u32 pool_ns_len;
48}; 48};
49 49
50struct ceph_mds_reply_dir_entry {
51 char *name;
52 u32 name_len;
53 struct ceph_mds_reply_lease *lease;
54 struct ceph_mds_reply_info_in inode;
55 loff_t offset;
56};
57
50/* 58/*
51 * parsed info about an mds reply, including information about 59 * parsed info about an mds reply, including information about
52 * either: 1) the target inode and/or its parent directory and dentry, 60 * either: 1) the target inode and/or its parent directory and dentry,
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed {
73 struct ceph_mds_reply_dirfrag *dir_dir; 81 struct ceph_mds_reply_dirfrag *dir_dir;
74 size_t dir_buf_size; 82 size_t dir_buf_size;
75 int dir_nr; 83 int dir_nr;
76 char **dir_dname; 84 bool dir_complete;
77 u32 *dir_dname_len; 85 bool dir_end;
78 struct ceph_mds_reply_lease **dir_dlease; 86 bool hash_order;
79 struct ceph_mds_reply_info_in *dir_in; 87 struct ceph_mds_reply_dir_entry *dir_entries;
80 u8 dir_complete, dir_end;
81 }; 88 };
82 89
83 /* for create results */ 90 /* for create results */
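
The four parallel readdir arrays (dir_dname, dir_dname_len, dir_dlease, dir_in) collapse into one dir_entries array, so each readdir result is a single struct and the reply buffer is sized as dir_nr * sizeof(struct ceph_mds_reply_dir_entry). A minimal consumer sketch, using only the fields declared above:

    /* sketch: walk a parsed readdir reply */
    static void walk_readdir(struct ceph_mds_reply_info_parsed *rinfo)
    {
            int i;

            for (i = 0; i < rinfo->dir_nr; i++) {
                    struct ceph_mds_reply_dir_entry *rde = &rinfo->dir_entries[i];

                    pr_debug("%.*s at offset %lld\n",
                             (int)rde->name_len, rde->name, rde->offset);
            }
    }

The per-entry offset, together with the new hash_order flag, is what the readdir code uses to hand out stable f_pos values (see ceph_make_fpos in super.h below).
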
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 261531e55e9d..8c3591a7fbae 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
54 const void *start = *p; 54 const void *start = *p;
55 int i, j, n; 55 int i, j, n;
56 int err = -EINVAL; 56 int err = -EINVAL;
57 u16 version; 57 u8 mdsmap_v, mdsmap_cv;
58 58
59 m = kzalloc(sizeof(*m), GFP_NOFS); 59 m = kzalloc(sizeof(*m), GFP_NOFS);
60 if (m == NULL) 60 if (m == NULL)
61 return ERR_PTR(-ENOMEM); 61 return ERR_PTR(-ENOMEM);
62 62
63 ceph_decode_16_safe(p, end, version, bad); 63 ceph_decode_need(p, end, 1 + 1, bad);
64 if (version > 3) { 64 mdsmap_v = ceph_decode_8(p);
65 pr_warn("got mdsmap version %d > 3, failing", version); 65 mdsmap_cv = ceph_decode_8(p);
66 goto bad; 66 if (mdsmap_v >= 4) {
67 u32 mdsmap_len;
68 ceph_decode_32_safe(p, end, mdsmap_len, bad);
69 if (end < *p + mdsmap_len)
70 goto bad;
71 end = *p + mdsmap_len;
67 } 72 }
68 73
69 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); 74 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
87 u32 namelen; 92 u32 namelen;
88 s32 mds, inc, state; 93 s32 mds, inc, state;
89 u64 state_seq; 94 u64 state_seq;
90 u8 infoversion; 95 u8 info_v;
96 void *info_end = NULL;
91 struct ceph_entity_addr addr; 97 struct ceph_entity_addr addr;
92 u32 num_export_targets; 98 u32 num_export_targets;
93 void *pexport_targets = NULL; 99 void *pexport_targets = NULL;
94 struct ceph_timespec laggy_since; 100 struct ceph_timespec laggy_since;
95 struct ceph_mds_info *info; 101 struct ceph_mds_info *info;
96 102
97 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 103 ceph_decode_need(p, end, sizeof(u64) + 1, bad);
98 global_id = ceph_decode_64(p); 104 global_id = ceph_decode_64(p);
99 infoversion = ceph_decode_8(p); 105 info_v = ceph_decode_8(p);
106 if (info_v >= 4) {
107 u32 info_len;
108 u8 info_cv;
109 ceph_decode_need(p, end, 1 + sizeof(u32), bad);
110 info_cv = ceph_decode_8(p);
111 info_len = ceph_decode_32(p);
112 info_end = *p + info_len;
113 if (info_end > end)
114 goto bad;
115 }
116
117 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
100 *p += sizeof(u64); 118 *p += sizeof(u64);
101 namelen = ceph_decode_32(p); /* skip mds name */ 119 namelen = ceph_decode_32(p); /* skip mds name */
102 *p += namelen; 120 *p += namelen;
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
115 *p += sizeof(u32); 133 *p += sizeof(u32);
116 ceph_decode_32_safe(p, end, namelen, bad); 134 ceph_decode_32_safe(p, end, namelen, bad);
117 *p += namelen; 135 *p += namelen;
118 if (infoversion >= 2) { 136 if (info_v >= 2) {
119 ceph_decode_32_safe(p, end, num_export_targets, bad); 137 ceph_decode_32_safe(p, end, num_export_targets, bad);
120 pexport_targets = *p; 138 pexport_targets = *p;
121 *p += num_export_targets * sizeof(u32); 139 *p += num_export_targets * sizeof(u32);
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
123 num_export_targets = 0; 141 num_export_targets = 0;
124 } 142 }
125 143
144 if (info_end && *p != info_end) {
145 if (*p > info_end)
146 goto bad;
147 *p = info_end;
148 }
149
126 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 150 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
127 i+1, n, global_id, mds, inc, 151 i+1, n, global_id, mds, inc,
128 ceph_pr_addr(&addr.in_addr), 152 ceph_pr_addr(&addr.in_addr),
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
163 m->m_cas_pg_pool = ceph_decode_64(p); 187 m->m_cas_pg_pool = ceph_decode_64(p);
164 188
165 /* ok, we don't care about the rest. */ 189 /* ok, we don't care about the rest. */
190 *p = end;
166 dout("mdsmap_decode success epoch %u\n", m->m_epoch); 191 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
167 return m; 192 return m;
168 193
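
Both the mdsmap header and each per-MDS entry now carry Ceph's versioned-encoding preamble: a version byte, a compat byte and, from v4 on, a payload length that lets the decoder bound its reads and skip trailing fields it does not understand. A sketch of the pattern, built from the same decode helpers used above (function name hypothetical):

    static int decode_versioned_blob(void **p, void *end)
    {
            u8 v, compat;
            void *blob_end = NULL;

            ceph_decode_need(p, end, 1 + 1, bad);
            v = ceph_decode_8(p);
            compat = ceph_decode_8(p);      /* compat version, unused here */
            if (v >= 4) {
                    u32 len;

                    ceph_decode_32_safe(p, end, len, bad);
                    if (end < *p + len)
                            goto bad;       /* truncated buffer */
                    blob_end = *p + len;
            }
            /* ... decode the fields this client knows about ... */
            if (blob_end)
                    *p = blob_end;          /* skip anything newer */
            return 0;
    bad:
            return -EINVAL;
    }

The *p = end at the tail of ceph_mdsmap_decode serves the same purpose for the map as a whole.
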
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f12d5e2955c2..91e02481ce06 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
108 * mount options 108 * mount options
109 */ 109 */
110enum { 110enum {
111 Opt_mds_namespace,
111 Opt_wsize, 112 Opt_wsize,
112 Opt_rsize, 113 Opt_rsize,
113 Opt_rasize, 114 Opt_rasize,
@@ -143,6 +144,7 @@ enum {
143}; 144};
144 145
145static match_table_t fsopt_tokens = { 146static match_table_t fsopt_tokens = {
147 {Opt_mds_namespace, "mds_namespace=%d"},
146 {Opt_wsize, "wsize=%d"}, 148 {Opt_wsize, "wsize=%d"},
147 {Opt_rsize, "rsize=%d"}, 149 {Opt_rsize, "rsize=%d"},
148 {Opt_rasize, "rasize=%d"}, 150 {Opt_rasize, "rasize=%d"},
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private)
212 break; 214 break;
213 215
214 /* misc */ 216 /* misc */
217 case Opt_mds_namespace:
218 fsopt->mds_namespace = intval;
219 break;
215 case Opt_wsize: 220 case Opt_wsize:
216 fsopt->wsize = intval; 221 fsopt->wsize = intval;
217 break; 222 break;
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
297{ 302{
298 dout("destroy_mount_options %p\n", args); 303 dout("destroy_mount_options %p\n", args);
299 kfree(args->snapdir_name); 304 kfree(args->snapdir_name);
305 kfree(args->server_path);
300 kfree(args); 306 kfree(args);
301} 307}
302 308
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
328 if (ret) 334 if (ret)
329 return ret; 335 return ret;
330 336
337 ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
338 if (ret)
339 return ret;
340
331 return ceph_compare_options(new_opt, fsc->client); 341 return ceph_compare_options(new_opt, fsc->client);
332} 342}
333 343
334static int parse_mount_options(struct ceph_mount_options **pfsopt, 344static int parse_mount_options(struct ceph_mount_options **pfsopt,
335 struct ceph_options **popt, 345 struct ceph_options **popt,
336 int flags, char *options, 346 int flags, char *options,
337 const char *dev_name, 347 const char *dev_name)
338 const char **path)
339{ 348{
340 struct ceph_mount_options *fsopt; 349 struct ceph_mount_options *fsopt;
341 const char *dev_name_end; 350 const char *dev_name_end;
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
367 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 376 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
368 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 377 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
369 fsopt->congestion_kb = default_congestion_kb(); 378 fsopt->congestion_kb = default_congestion_kb();
379 fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
370 380
371 /* 381 /*
372 * Distinguish the server list from the path in "dev_name". 382 * Distinguish the server list from the path in "dev_name".
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
380 */ 390 */
381 dev_name_end = strchr(dev_name, '/'); 391 dev_name_end = strchr(dev_name, '/');
382 if (dev_name_end) { 392 if (dev_name_end) {
383 /* skip over leading '/' for path */ 393 fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
384 *path = dev_name_end + 1; 394 if (!fsopt->server_path) {
395 err = -ENOMEM;
396 goto out;
397 }
385 } else { 398 } else {
386 /* path is empty */
387 dev_name_end = dev_name + strlen(dev_name); 399 dev_name_end = dev_name + strlen(dev_name);
388 *path = dev_name_end;
389 } 400 }
390 err = -EINVAL; 401 err = -EINVAL;
391 dev_name_end--; /* back up to ':' separator */ 402 dev_name_end--; /* back up to ':' separator */
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
395 goto out; 406 goto out;
396 } 407 }
397 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 408 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
398 dout("server path '%s'\n", *path); 409 if (fsopt->server_path)
410 dout("server path '%s'\n", fsopt->server_path);
399 411
400 *popt = ceph_parse_options(options, dev_name, dev_name_end, 412 *popt = ceph_parse_options(options, dev_name, dev_name_end,
401 parse_fsopt_token, (void *)fsopt); 413 parse_fsopt_token, (void *)fsopt);
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
457 seq_puts(m, ",noacl"); 469 seq_puts(m, ",noacl");
458#endif 470#endif
459 471
472 if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
473 seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
460 if (fsopt->wsize) 474 if (fsopt->wsize)
461 seq_printf(m, ",wsize=%d", fsopt->wsize); 475 seq_printf(m, ",wsize=%d", fsopt->wsize);
462 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 476 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
511{ 525{
512 struct ceph_fs_client *fsc; 526 struct ceph_fs_client *fsc;
513 const u64 supported_features = 527 const u64 supported_features =
514 CEPH_FEATURE_FLOCK | 528 CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH |
515 CEPH_FEATURE_DIRLAYOUTHASH | 529 CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA;
516 CEPH_FEATURE_MDS_INLINE_DATA;
517 const u64 required_features = 0; 530 const u64 required_features = 0;
518 int page_count; 531 int page_count;
519 size_t size; 532 size_t size;
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
530 goto fail; 543 goto fail;
531 } 544 }
532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 545 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
546 fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
533 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); 547 ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
534 548
535 fsc->mount_options = fsopt; 549 fsc->mount_options = fsopt;
@@ -785,8 +799,7 @@ out:
785/* 799/*
786 * mount: join the ceph cluster, and open root directory. 800 * mount: join the ceph cluster, and open root directory.
787 */ 801 */
788static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, 802static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
789 const char *path)
790{ 803{
791 int err; 804 int err;
792 unsigned long started = jiffies; /* note the start time */ 805 unsigned long started = jiffies; /* note the start time */
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
815 goto fail; 828 goto fail;
816 } 829 }
817 830
818 if (path[0] == 0) { 831 if (!fsc->mount_options->server_path) {
819 root = fsc->sb->s_root; 832 root = fsc->sb->s_root;
820 dget(root); 833 dget(root);
821 } else { 834 } else {
822 dout("mount opening base mountpoint\n"); 835 const char *path = fsc->mount_options->server_path + 1;
836 dout("mount opening path %s\n", path);
823 root = open_root_dentry(fsc, path, started); 837 root = open_root_dentry(fsc, path, started);
824 if (IS_ERR(root)) { 838 if (IS_ERR(root)) {
825 err = PTR_ERR(root); 839 err = PTR_ERR(root);
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
935 struct dentry *res; 949 struct dentry *res;
936 int err; 950 int err;
937 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 951 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
938 const char *path = NULL;
939 struct ceph_mount_options *fsopt = NULL; 952 struct ceph_mount_options *fsopt = NULL;
940 struct ceph_options *opt = NULL; 953 struct ceph_options *opt = NULL;
941 954
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
944#ifdef CONFIG_CEPH_FS_POSIX_ACL 957#ifdef CONFIG_CEPH_FS_POSIX_ACL
945 flags |= MS_POSIXACL; 958 flags |= MS_POSIXACL;
946#endif 959#endif
947 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); 960 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name);
948 if (err < 0) { 961 if (err < 0) {
949 res = ERR_PTR(err); 962 res = ERR_PTR(err);
950 goto out_final; 963 goto out_final;
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
987 } 1000 }
988 } 1001 }
989 1002
990 res = ceph_real_mount(fsc, path); 1003 res = ceph_real_mount(fsc);
991 if (IS_ERR(res)) 1004 if (IS_ERR(res))
992 goto out_splat; 1005 goto out_splat;
993 dout("root %p inode %p ino %llx.%llx\n", res, 1006 dout("root %p inode %p ino %llx.%llx\n", res,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7b99eb756477..0130a8592191 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,6 +62,7 @@ struct ceph_mount_options {
62 int cap_release_safety; 62 int cap_release_safety;
63 int max_readdir; /* max readdir result (entries) */ 63 int max_readdir; /* max readdir result (entries) */
64 int max_readdir_bytes; /* max readdir result (bytes) */ 64 int max_readdir_bytes; /* max readdir result (bytes) */
65 int mds_namespace;
65 66
66 /* 67 /*
67 * everything above this point can be memcmp'd; everything below 68 * everything above this point can be memcmp'd; everything below
@@ -69,6 +70,7 @@ struct ceph_mount_options {
69 */ 70 */
70 71
71 char *snapdir_name; /* default ".snap" */ 72 char *snapdir_name; /* default ".snap" */
73 char *server_path; /* default "/" */
72}; 74};
73 75
74struct ceph_fs_client { 76struct ceph_fs_client {
@@ -295,6 +297,7 @@ struct ceph_inode_info {
295 u64 i_files, i_subdirs; 297 u64 i_files, i_subdirs;
296 298
297 struct rb_root i_fragtree; 299 struct rb_root i_fragtree;
300 int i_fragtree_nsplits;
298 struct mutex i_fragtree_mutex; 301 struct mutex i_fragtree_mutex;
299 302
300 struct ceph_inode_xattrs_info i_xattrs; 303 struct ceph_inode_xattrs_info i_xattrs;
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
469#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 472#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
470#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 473#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
471#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 474#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
475#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
472 476
473static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 477static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
474 long long release_count, 478 long long release_count,
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
537 return (struct ceph_dentry_info *)dentry->d_fsdata; 541 return (struct ceph_dentry_info *)dentry->d_fsdata;
538} 542}
539 543
540static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
541{
542 return ((loff_t)frag << 32) | (loff_t)off;
543}
544
545/* 544/*
546 * caps helpers 545 * caps helpers
547 */ 546 */
@@ -632,7 +631,6 @@ struct ceph_file_info {
632 struct ceph_mds_request *last_readdir; 631 struct ceph_mds_request *last_readdir;
633 632
634 /* readdir: position within a frag */ 633 /* readdir: position within a frag */
635 unsigned offset; /* offset of last chunk, adjusted for . and .. */
636 unsigned next_offset; /* offset of next chunk (last_name's + 1) */ 634 unsigned next_offset; /* offset of next chunk (last_name's + 1) */
637 char *last_name; /* last entry in previous chunk */ 635 char *last_name; /* last entry in previous chunk */
638 long long dir_release_count; 636 long long dir_release_count;
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
927/* file.c */ 925/* file.c */
928extern const struct file_operations ceph_file_fops; 926extern const struct file_operations ceph_file_fops;
929 927
928extern int ceph_renew_caps(struct inode *inode);
930extern int ceph_open(struct inode *inode, struct file *file); 929extern int ceph_open(struct inode *inode, struct file *file);
931extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 930extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
932 struct file *file, unsigned flags, umode_t mode, 931 struct file *file, unsigned flags, umode_t mode,
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops;
942extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 941extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
943 ceph_snapdir_dentry_ops; 942 ceph_snapdir_dentry_ops;
944 943
944extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order);
945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 945extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
946extern int ceph_handle_snapdir(struct ceph_mds_request *req, 946extern int ceph_handle_snapdir(struct ceph_mds_request *req,
947 struct dentry *dentry, int err); 947 struct dentry *dentry, int err);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..dacc1bd85629 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
77 char buf[128]; 77 char buf[128];
78 78
79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); 79 dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
80 down_read(&osdc->map_sem); 80 down_read(&osdc->lock);
81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 81 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
82 if (pool_name) { 82 if (pool_name) {
83 size_t len = strlen(pool_name); 83 size_t len = strlen(pool_name);
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
109 ret = -ERANGE; 109 ret = -ERANGE;
110 } 110 }
111 } 111 }
112 up_read(&osdc->map_sem); 112 up_read(&osdc->lock);
113 return ret; 113 return ret;
114} 114}
115 115
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout); 143 s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
144 const char *pool_name; 144 const char *pool_name;
145 145
146 down_read(&osdc->map_sem); 146 down_read(&osdc->lock);
147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); 147 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
148 if (pool_name) 148 if (pool_name)
149 ret = snprintf(val, size, "%s", pool_name); 149 ret = snprintf(val, size, "%s", pool_name);
150 else 150 else
151 ret = snprintf(val, size, "%lld", (unsigned long long)pool); 151 ret = snprintf(val, size, "%lld", (unsigned long long)pool);
152 up_read(&osdc->map_sem); 152 up_read(&osdc->lock);
153 return ret; 153 return ret;
154} 154}
155 155
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
862 struct ceph_mds_request *req; 862 struct ceph_mds_request *req;
863 struct ceph_mds_client *mdsc = fsc->mdsc; 863 struct ceph_mds_client *mdsc = fsc->mdsc;
864 struct ceph_pagelist *pagelist = NULL; 864 struct ceph_pagelist *pagelist = NULL;
865 int op = CEPH_MDS_OP_SETXATTR;
865 int err; 866 int err;
866 867
867 if (size > 0) { 868 if (size > 0) {
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
875 if (err) 876 if (err)
876 goto out; 877 goto out;
877 } else if (!value) { 878 } else if (!value) {
878 flags |= CEPH_XATTR_REMOVE; 879 if (flags & CEPH_XATTR_REPLACE)
880 op = CEPH_MDS_OP_RMXATTR;
881 else
882 flags |= CEPH_XATTR_REMOVE;
879 } 883 }
880 884
881 dout("setxattr value=%.*s\n", (int)size, value); 885 dout("setxattr value=%.*s\n", (int)size, value);
882 886
883 /* do request */ 887 /* do request */
884 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 888 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
885 USE_AUTH_MDS);
886 if (IS_ERR(req)) { 889 if (IS_ERR(req)) {
887 err = PTR_ERR(req); 890 err = PTR_ERR(req);
888 goto out; 891 goto out;
889 } 892 }
890 893
891 req->r_args.setxattr.flags = cpu_to_le32(flags);
892 req->r_path2 = kstrdup(name, GFP_NOFS); 894 req->r_path2 = kstrdup(name, GFP_NOFS);
893 if (!req->r_path2) { 895 if (!req->r_path2) {
894 ceph_mdsc_put_request(req); 896 ceph_mdsc_put_request(req);
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
896 goto out; 898 goto out;
897 } 899 }
898 900
899 req->r_pagelist = pagelist; 901 if (op == CEPH_MDS_OP_SETXATTR) {
900 pagelist = NULL; 902 req->r_args.setxattr.flags = cpu_to_le32(flags);
903 req->r_pagelist = pagelist;
904 pagelist = NULL;
905 }
901 906
902 req->r_inode = inode; 907 req->r_inode = inode;
903 ihold(inode); 908 ihold(inode);
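
Summarizing the new removal paths (a sketch; the fail-if-absent behaviour is an assumption inferred from XATTR_REPLACE semantics, not stated in the patch):

    /*
     * value == NULL, CEPH_XATTR_REPLACE set   -> CEPH_MDS_OP_RMXATTR,
     *     so the MDS can fail if the xattr does not exist
     * value == NULL, CEPH_XATTR_REPLACE clear -> CEPH_MDS_OP_SETXATTR
     *     with CEPH_XATTR_REMOVE (remove-if-present)
     * value != NULL                           -> CEPH_MDS_OP_SETXATTR,
     *     flags and value pagelist attached to the request
     */

Note that r_args.setxattr.flags and r_pagelist are now only filled in on the SETXATTR path, since an RMXATTR request carries neither.
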
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index b827e066e55a..146507df8650 100644
--- a/include/linux/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
51 return ceph_frag_make(newbits, 51 return ceph_frag_make(newbits,
52 ceph_frag_value(f) | (i << (24 - newbits))); 52 ceph_frag_value(f) | (i << (24 - newbits)));
53} 53}
54static inline int ceph_frag_is_leftmost(__u32 f) 54static inline bool ceph_frag_is_leftmost(__u32 f)
55{ 55{
56 return ceph_frag_value(f) == 0; 56 return ceph_frag_value(f) == 0;
57} 57}
58static inline int ceph_frag_is_rightmost(__u32 f) 58static inline bool ceph_frag_is_rightmost(__u32 f)
59{ 59{
60 return ceph_frag_value(f) == ceph_frag_mask(f); 60 return ceph_frag_value(f) == ceph_frag_mask(f);
61} 61}
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 37f28bf55ce4..dfce616002ad 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -153,8 +153,9 @@ struct ceph_dir_layout {
153 153
154/* watch-notify operations */ 154/* watch-notify operations */
155enum { 155enum {
156 WATCH_NOTIFY = 1, /* notifying watcher */ 156 CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */
157 WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ 157 CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */
158 CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */
158}; 159};
159 160
160 161
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack {
207 struct ceph_fsid fsid; 208 struct ceph_fsid fsid;
208} __attribute__ ((packed)); 209} __attribute__ ((packed));
209 210
211#define CEPH_FS_CLUSTER_ID_NONE -1
212
210/* 213/*
211 * mdsmap flags 214 * mdsmap flags
212 */ 215 */
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op);
344#define CEPH_XATTR_REPLACE (1 << 1) 347#define CEPH_XATTR_REPLACE (1 << 1)
345#define CEPH_XATTR_REMOVE (1 << 31) 348#define CEPH_XATTR_REMOVE (1 << 31)
346 349
350/*
351 * readdir request flags.
352 */
353#define CEPH_READDIR_REPLY_BITFLAGS (1<<0)
354
355/*
356 * readdir reply flags.
357 */
358#define CEPH_READDIR_FRAG_END (1<<0)
359#define CEPH_READDIR_FRAG_COMPLETE (1<<8)
360#define CEPH_READDIR_HASH_ORDER (1<<9)
361
347union ceph_mds_request_args { 362union ceph_mds_request_args {
348 struct { 363 struct {
349 __le32 mask; /* CEPH_CAP_* */ 364 __le32 mask; /* CEPH_CAP_* */
@@ -361,6 +376,7 @@ union ceph_mds_request_args {
361 __le32 frag; /* which dir fragment */ 376 __le32 frag; /* which dir fragment */
362 __le32 max_entries; /* how many dentries to grab */ 377 __le32 max_entries; /* how many dentries to grab */
363 __le32 max_bytes; 378 __le32 max_bytes;
379 __le16 flags;
364 } __attribute__ ((packed)) readdir; 380 } __attribute__ ((packed)) readdir;
365 struct { 381 struct {
366 __le32 mode; 382 __le32 mode;
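
The reply bits correspond one-to-one with the new booleans in ceph_mds_reply_info_parsed (mds_client.h above). A parsing sketch, assuming flags has already been decoded from the reply:

    /* sketch: map readdir reply flag bits to the parsed-info booleans */
    rinfo->dir_end      = !!(flags & CEPH_READDIR_FRAG_END);
    rinfo->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
    rinfo->hash_order   = !!(flags & CEPH_READDIR_HASH_ORDER);

The client advertises that it understands this layout by setting CEPH_READDIR_REPLY_BITFLAGS in the new __le16 flags member of the readdir request args.
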
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index a6ef9cc267ec..19e9932f3e77 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n)
47/* 47/*
48 * bounds check input. 48 * bounds check input.
49 */ 49 */
50static inline int ceph_has_room(void **p, void *end, size_t n) 50static inline bool ceph_has_room(void **p, void *end, size_t n)
51{ 51{
52 return end >= *p && n <= end - *p; 52 return end >= *p && n <= end - *p;
53} 53}
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index db92a8d4926e..690985daad1c 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len)
180 (off >> PAGE_SHIFT); 180 (off >> PAGE_SHIFT);
181} 181}
182 182
183/*
184 * These are not meant to be generic - an integer key is assumed.
185 */
186#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
187static void insert_##name(struct rb_root *root, type *t) \
188{ \
189 struct rb_node **n = &root->rb_node; \
190 struct rb_node *parent = NULL; \
191 \
192 BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \
193 \
194 while (*n) { \
195 type *cur = rb_entry(*n, type, nodefld); \
196 \
197 parent = *n; \
198 if (t->keyfld < cur->keyfld) \
199 n = &(*n)->rb_left; \
200 else if (t->keyfld > cur->keyfld) \
201 n = &(*n)->rb_right; \
202 else \
203 BUG(); \
204 } \
205 \
206 rb_link_node(&t->nodefld, parent, n); \
207 rb_insert_color(&t->nodefld, root); \
208} \
209static void erase_##name(struct rb_root *root, type *t) \
210{ \
211 BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \
212 rb_erase(&t->nodefld, root); \
213 RB_CLEAR_NODE(&t->nodefld); \
214}
215
216#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
217static type *lookup_##name(struct rb_root *root, \
218 typeof(((type *)0)->keyfld) key) \
219{ \
220 struct rb_node *n = root->rb_node; \
221 \
222 while (n) { \
223 type *cur = rb_entry(n, type, nodefld); \
224 \
225 if (key < cur->keyfld) \
226 n = n->rb_left; \
227 else if (key > cur->keyfld) \
228 n = n->rb_right; \
229 else \
230 return cur; \
231 } \
232 \
233 return NULL; \
234}
235
236#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
237DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
238DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
239
183extern struct kmem_cache *ceph_inode_cachep; 240extern struct kmem_cache *ceph_inode_cachep;
184extern struct kmem_cache *ceph_cap_cachep; 241extern struct kmem_cache *ceph_cap_cachep;
185extern struct kmem_cache *ceph_cap_flush_cachep; 242extern struct kmem_cache *ceph_cap_flush_cachep;
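
Each DEFINE_RB_FUNCS(name, ...) instantiation generates insert_name(), erase_name() and lookup_name() for one integer-keyed tree. The mds_client.c hunks above call insert_request/erase_request/lookup_request, which suggests an instantiation like the following (a sketch; the line itself is not shown in this diff):

    /* generates insert_request(), erase_request(), lookup_request() */
    DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)

The BUG_ON(!RB_EMPTY_NODE(...)) in insert_##name is also why ceph_mdsc_create_request now does RB_CLEAR_NODE(&req->r_node) at init time: a node must read as "not on a tree" before its first insertion.
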
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e230e7ed60d3..e2a92df08b47 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -39,20 +39,31 @@ struct ceph_mon_request {
39 ceph_monc_request_func_t do_request; 39 ceph_monc_request_func_t do_request;
40}; 40};
41 41
42typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *);
43
42/* 44/*
43 * ceph_mon_generic_request is being used for the statfs and 45 * ceph_mon_generic_request is being used for the statfs and
44 * mon_get_version requests which are being done a bit differently 46 * mon_get_version requests which are being done a bit differently
45 * because we need to get data back to the caller 47 * because we need to get data back to the caller
46 */ 48 */
47struct ceph_mon_generic_request { 49struct ceph_mon_generic_request {
50 struct ceph_mon_client *monc;
48 struct kref kref; 51 struct kref kref;
49 u64 tid; 52 u64 tid;
50 struct rb_node node; 53 struct rb_node node;
51 int result; 54 int result;
52 void *buf; 55
53 struct completion completion; 56 struct completion completion;
57 ceph_monc_callback_t complete_cb;
58 u64 private_data; /* r_tid/linger_id */
59
54 struct ceph_msg *request; /* original request */ 60 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */ 61 struct ceph_msg *reply; /* and reply */
62
63 union {
64 struct ceph_statfs *st;
65 u64 newest;
66 } u;
56}; 67};
57 68
58struct ceph_mon_client { 69struct ceph_mon_client {
@@ -77,7 +88,6 @@ struct ceph_mon_client {
77 88
78 /* pending generic requests */ 89 /* pending generic requests */
79 struct rb_root generic_request_tree; 90 struct rb_root generic_request_tree;
80 int num_generic_requests;
81 u64 last_tid; 91 u64 last_tid;
82 92
83 /* subs, indexed with CEPH_SUB_* */ 93 /* subs, indexed with CEPH_SUB_* */
@@ -86,6 +96,7 @@ struct ceph_mon_client {
86 bool want; 96 bool want;
87 u32 have; /* epoch */ 97 u32 have; /* epoch */
88 } subs[3]; 98 } subs[3];
99 int fs_cluster_id; /* "mdsmap.<id>" sub */
89 100
90#ifdef CONFIG_DEBUG_FS 101#ifdef CONFIG_DEBUG_FS
91 struct dentry *debugfs_file; 102 struct dentry *debugfs_file;
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[];
116bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 127bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
117 bool continuous); 128 bool continuous);
118void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); 129void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
130void ceph_monc_renew_subs(struct ceph_mon_client *monc);
119 131
120extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
121extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 132extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
122 unsigned long timeout); 133 unsigned long timeout);
123 134
124extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, 135extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
125 struct ceph_statfs *buf); 136 struct ceph_statfs *buf);
126 137
127extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, 138int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
128 const char *what, u64 *newest); 139 u64 *newest);
140int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
141 ceph_monc_callback_t cb, u64 private_data);
129 142
130extern int ceph_monc_open_session(struct ceph_mon_client *monc); 143extern int ceph_monc_open_session(struct ceph_mon_client *monc);
131 144
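
ceph_monc_get_version_async() keeps the generic-request plumbing but returns immediately; the answer arrives in the completion callback, which reads result and u.newest off the request. A hypothetical caller:

    /* hypothetical callback: runs when the mon answers */
    static void got_newest(struct ceph_mon_generic_request *req)
    {
            if (req->result)
                    pr_err("get_version failed: %d\n", req->result);
            else
                    pr_info("newest epoch %llu (caller data %llu)\n",
                            req->u.newest, req->private_data);
    }

    /* somewhere with a struct ceph_mon_client *monc in scope:
     * ceph_monc_get_version_async(monc, "osdmap", got_newest, 0); */

private_data is the opaque u64 the submitter passed in, per the r_tid/linger_id comment above.
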
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index cbf460927c42..19b14862d3e0 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -20,10 +20,11 @@ struct ceph_osd_client;
20/* 20/*
21 * completion callback for async writepages 21 * completion callback for async writepages
22 */ 22 */
23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 23typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *);
24 struct ceph_msg *);
25typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); 24typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
26 25
26#define CEPH_HOMELESS_OSD -1
27
27/* a given osd we're communicating with */ 28/* a given osd we're communicating with */
28struct ceph_osd { 29struct ceph_osd {
29 atomic_t o_ref; 30 atomic_t o_ref;
@@ -32,16 +33,15 @@ struct ceph_osd {
32 int o_incarnation; 33 int o_incarnation;
33 struct rb_node o_node; 34 struct rb_node o_node;
34 struct ceph_connection o_con; 35 struct ceph_connection o_con;
35 struct list_head o_requests; 36 struct rb_root o_requests;
36 struct list_head o_linger_requests; 37 struct rb_root o_linger_requests;
37 struct list_head o_osd_lru; 38 struct list_head o_osd_lru;
38 struct ceph_auth_handshake o_auth; 39 struct ceph_auth_handshake o_auth;
39 unsigned long lru_ttl; 40 unsigned long lru_ttl;
40 int o_marked_for_keepalive;
41 struct list_head o_keepalive_item; 41 struct list_head o_keepalive_item;
42 struct mutex lock;
42}; 43};
43 44
44
45#define CEPH_OSD_SLAB_OPS 2 45#define CEPH_OSD_SLAB_OPS 2
46#define CEPH_OSD_MAX_OPS 16 46#define CEPH_OSD_MAX_OPS 16
47 47
@@ -104,76 +104,95 @@ struct ceph_osd_req_op {
104 struct ceph_osd_data response_data; 104 struct ceph_osd_data response_data;
105 __u8 class_len; 105 __u8 class_len;
106 __u8 method_len; 106 __u8 method_len;
107 __u8 argc; 107 u32 indata_len;
108 } cls; 108 } cls;
109 struct { 109 struct {
110 u64 cookie; 110 u64 cookie;
111 u64 ver; 111 __u8 op; /* CEPH_OSD_WATCH_OP_ */
112 u32 prot_ver; 112 u32 gen;
113 u32 timeout;
114 __u8 flag;
115 } watch; 113 } watch;
116 struct { 114 struct {
115 struct ceph_osd_data request_data;
116 } notify_ack;
117 struct {
118 u64 cookie;
119 struct ceph_osd_data request_data;
120 struct ceph_osd_data response_data;
121 } notify;
122 struct {
117 u64 expected_object_size; 123 u64 expected_object_size;
118 u64 expected_write_size; 124 u64 expected_write_size;
119 } alloc_hint; 125 } alloc_hint;
120 }; 126 };
121}; 127};
122 128
129struct ceph_osd_request_target {
130 struct ceph_object_id base_oid;
131 struct ceph_object_locator base_oloc;
132 struct ceph_object_id target_oid;
133 struct ceph_object_locator target_oloc;
134
135 struct ceph_pg pgid;
136 u32 pg_num;
137 u32 pg_num_mask;
138 struct ceph_osds acting;
139 struct ceph_osds up;
140 int size;
141 int min_size;
142 bool sort_bitwise;
143
144 unsigned int flags; /* CEPH_OSD_FLAG_* */
145 bool paused;
146
147 int osd;
148};
149
123/* an in-flight request */ 150/* an in-flight request */
124struct ceph_osd_request { 151struct ceph_osd_request {
125 u64 r_tid; /* unique for this client */ 152 u64 r_tid; /* unique for this client */
126 struct rb_node r_node; 153 struct rb_node r_node;
127 struct list_head r_req_lru_item; 154 struct rb_node r_mc_node; /* map check */
128 struct list_head r_osd_item;
129 struct list_head r_linger_item;
130 struct list_head r_linger_osd_item;
131 struct ceph_osd *r_osd; 155 struct ceph_osd *r_osd;
132 struct ceph_pg r_pgid; 156
133 int r_pg_osds[CEPH_PG_MAX_SIZE]; 157 struct ceph_osd_request_target r_t;
134 int r_num_pg_osds; 158#define r_base_oid r_t.base_oid
159#define r_base_oloc r_t.base_oloc
160#define r_flags r_t.flags
135 161
136 struct ceph_msg *r_request, *r_reply; 162 struct ceph_msg *r_request, *r_reply;
137 int r_flags; /* any additional flags for the osd */
138 u32 r_sent; /* >0 if r_request is sending/sent */ 163 u32 r_sent; /* >0 if r_request is sending/sent */
139 164
140 /* request osd ops array */ 165 /* request osd ops array */
141 unsigned int r_num_ops; 166 unsigned int r_num_ops;
142 167
143 /* these are updated on each send */
144 __le32 *r_request_osdmap_epoch;
145 __le32 *r_request_flags;
146 __le64 *r_request_pool;
147 void *r_request_pgid;
148 __le32 *r_request_attempts;
149 bool r_paused;
150 struct ceph_eversion *r_request_reassert_version;
151
152 int r_result; 168 int r_result;
153 int r_got_reply; 169 bool r_got_reply;
154 int r_linger;
155 170
156 struct ceph_osd_client *r_osdc; 171 struct ceph_osd_client *r_osdc;
157 struct kref r_kref; 172 struct kref r_kref;
158 bool r_mempool; 173 bool r_mempool;
159 struct completion r_completion, r_safe_completion; 174 struct completion r_completion;
175 struct completion r_safe_completion; /* fsync waiter */
160 ceph_osdc_callback_t r_callback; 176 ceph_osdc_callback_t r_callback;
161 ceph_osdc_unsafe_callback_t r_unsafe_callback; 177 ceph_osdc_unsafe_callback_t r_unsafe_callback;
162 struct ceph_eversion r_reassert_version;
163 struct list_head r_unsafe_item; 178 struct list_head r_unsafe_item;
164 179
165 struct inode *r_inode; /* for use by callbacks */ 180 struct inode *r_inode; /* for use by callbacks */
166 void *r_priv; /* ditto */ 181 void *r_priv; /* ditto */
167 182
168 struct ceph_object_locator r_base_oloc; 183 /* set by submitter */
169 struct ceph_object_id r_base_oid; 184 u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */
170 struct ceph_object_locator r_target_oloc; 185 struct ceph_snap_context *r_snapc; /* for writes */
171 struct ceph_object_id r_target_oid; 186 struct timespec r_mtime; /* ditto */
172 187 u64 r_data_offset; /* ditto */
173 u64 r_snapid; 188 bool r_linger; /* don't resend on failure */
174 unsigned long r_stamp; /* send OR check time */
175 189
176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 190 /* internal */
191 unsigned long r_stamp; /* jiffies, send or check time */
192 int r_attempts;
193 struct ceph_eversion r_replay_version; /* aka reassert_version */
194 u32 r_last_force_resend;
195 u32 r_map_dne_bound;
177 196
178 struct ceph_osd_req_op r_ops[]; 197 struct ceph_osd_req_op r_ops[];
179}; 198};
@@ -182,44 +201,70 @@ struct ceph_request_redirect {
182 struct ceph_object_locator oloc; 201 struct ceph_object_locator oloc;
183}; 202};
184 203
185struct ceph_osd_event { 204typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
186 u64 cookie; 205 u64 notifier_id, void *data, size_t data_len);
187 int one_shot; 206typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
207
208struct ceph_osd_linger_request {
188 struct ceph_osd_client *osdc; 209 struct ceph_osd_client *osdc;
189 void (*cb)(u64, u64, u8, void *); 210 u64 linger_id;
190 void *data; 211 bool committed;
191 struct rb_node node; 212 bool is_watch; /* watch or notify */
192 struct list_head osd_node; 213
214 struct ceph_osd *osd;
215 struct ceph_osd_request *reg_req;
216 struct ceph_osd_request *ping_req;
217 unsigned long ping_sent;
218 unsigned long watch_valid_thru;
219 struct list_head pending_lworks;
220
221 struct ceph_osd_request_target t;
222 u32 last_force_resend;
223 u32 map_dne_bound;
224
225 struct timespec mtime;
226
193 struct kref kref; 227 struct kref kref;
194}; 228 struct mutex lock;
229 struct rb_node node; /* osd */
230 struct rb_node osdc_node; /* osdc */
231 struct rb_node mc_node; /* map check */
232 struct list_head scan_item;
233
234 struct completion reg_commit_wait;
235 struct completion notify_finish_wait;
236 int reg_commit_error;
237 int notify_finish_error;
238 int last_error;
239
240 u32 register_gen;
241 u64 notify_id;
242
243 rados_watchcb2_t wcb;
244 rados_watcherrcb_t errcb;
245 void *data;
195 246
196struct ceph_osd_event_work { 247 struct page ***preply_pages;
197 struct work_struct work; 248 size_t *preply_len;
198 struct ceph_osd_event *event;
199 u64 ver;
200 u64 notify_id;
201 u8 opcode;
202}; 249};
203 250
204struct ceph_osd_client { 251struct ceph_osd_client {
205 struct ceph_client *client; 252 struct ceph_client *client;
206 253
207 struct ceph_osdmap *osdmap; /* current map */ 254 struct ceph_osdmap *osdmap; /* current map */
208 struct rw_semaphore map_sem; 255 struct rw_semaphore lock;
209 struct completion map_waiters;
210 u64 last_requested_map;
211 256
212 struct mutex request_mutex;
213 struct rb_root osds; /* osds */ 257 struct rb_root osds; /* osds */
214 struct list_head osd_lru; /* idle osds */ 258 struct list_head osd_lru; /* idle osds */
215 u64 timeout_tid; /* tid of timeout triggering rq */ 259 spinlock_t osd_lru_lock;
216 u64 last_tid; /* tid of last request */ 260 struct ceph_osd homeless_osd;
217 struct rb_root requests; /* pending requests */ 261 atomic64_t last_tid; /* tid of last request */
218 struct list_head req_lru; /* in-flight lru */ 262 u64 last_linger_id;
219 struct list_head req_unsent; /* unsent/need-resend queue */ 263 struct rb_root linger_requests; /* lingering requests */
220 struct list_head req_notarget; /* map to no osd */ 264 struct rb_root map_checks;
221 struct list_head req_linger; /* lingering requests */ 265 struct rb_root linger_map_checks;
222 int num_requests; 266 atomic_t num_requests;
267 atomic_t num_homeless;
223 struct delayed_work timeout_work; 268 struct delayed_work timeout_work;
224 struct delayed_work osds_timeout_work; 269 struct delayed_work osds_timeout_work;
225#ifdef CONFIG_DEBUG_FS 270#ifdef CONFIG_DEBUG_FS
@@ -231,10 +276,6 @@ struct ceph_osd_client {
231 struct ceph_msgpool msgpool_op; 276 struct ceph_msgpool msgpool_op;
232 struct ceph_msgpool msgpool_op_reply; 277 struct ceph_msgpool msgpool_op_reply;
233 278
234 spinlock_t event_lock;
235 struct rb_root event_tree;
236 u64 event_count;
237
238 struct workqueue_struct *notify_wq; 279 struct workqueue_struct *notify_wq;
239}; 280};
240 281
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
271extern struct ceph_osd_data *osd_req_op_extent_osd_data( 312extern struct ceph_osd_data *osd_req_op_extent_osd_data(
272 struct ceph_osd_request *osd_req, 313 struct ceph_osd_request *osd_req,
273 unsigned int which); 314 unsigned int which);
274extern struct ceph_osd_data *osd_req_op_cls_response_data(
275 struct ceph_osd_request *osd_req,
276 unsigned int which);
277 315
278extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, 316extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
279 unsigned int which, 317 unsigned int which,
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
309extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 347extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
310 u16 opcode, const char *name, const void *value, 348 u16 opcode, const char *name, const void *value,
311 size_t size, u8 cmp_op, u8 cmp_mode); 349 size_t size, u8 cmp_op, u8 cmp_mode);
312extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
313 unsigned int which, u16 opcode,
314 u64 cookie, u64 version, int flag);
315extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 350extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
316 unsigned int which, 351 unsigned int which,
317 u64 expected_object_size, 352 u64 expected_object_size,
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *
322 unsigned int num_ops, 357 unsigned int num_ops,
323 bool use_mempool, 358 bool use_mempool,
324 gfp_t gfp_flags); 359 gfp_t gfp_flags);
325 360int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp);
326extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
327 struct ceph_snap_context *snapc,
328 u64 snap_id,
329 struct timespec *mtime);
330 361
331extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 362extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
332 struct ceph_file_layout *layout, 363 struct ceph_file_layout *layout,
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
338 u32 truncate_seq, u64 truncate_size, 369 u32 truncate_seq, u64 truncate_size,
339 bool use_mempool); 370 bool use_mempool);
340 371
341extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
342 struct ceph_osd_request *req);
343
344extern void ceph_osdc_get_request(struct ceph_osd_request *req); 372extern void ceph_osdc_get_request(struct ceph_osd_request *req);
345extern void ceph_osdc_put_request(struct ceph_osd_request *req); 373extern void ceph_osdc_put_request(struct ceph_osd_request *req);
346 374
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
353extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 381extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
354 382
355extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); 383extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
384void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc);
356 385
357extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, 386extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
358 struct ceph_vino vino, 387 struct ceph_vino vino,
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
371 struct timespec *mtime, 400 struct timespec *mtime,
372 struct page **pages, int nr_pages); 401 struct page **pages, int nr_pages);
373 402
374/* watch/notify events */ 403/* watch/notify */
375extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 404struct ceph_osd_linger_request *
376 void (*event_cb)(u64, u64, u8, void *), 405ceph_osdc_watch(struct ceph_osd_client *osdc,
377 void *data, struct ceph_osd_event **pevent); 406 struct ceph_object_id *oid,
378extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 407 struct ceph_object_locator *oloc,
379extern void ceph_osdc_put_event(struct ceph_osd_event *event); 408 rados_watchcb2_t wcb,
409 rados_watcherrcb_t errcb,
410 void *data);
411int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
412 struct ceph_osd_linger_request *lreq);
413
414int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
415 struct ceph_object_id *oid,
416 struct ceph_object_locator *oloc,
417 u64 notify_id,
418 u64 cookie,
419 void *payload,
420 size_t payload_len);
421int ceph_osdc_notify(struct ceph_osd_client *osdc,
422 struct ceph_object_id *oid,
423 struct ceph_object_locator *oloc,
424 void *payload,
425 size_t payload_len,
426 u32 timeout,
427 struct page ***preply_pages,
428 size_t *preply_len);
429int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
430 struct ceph_osd_linger_request *lreq);
380#endif 431#endif
381 432
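
The old one-shot event API (ceph_osd_event, create/cancel/put_event) is gone; watches are now long-lived linger requests with notify-v2 style callbacks. A hedged sketch of a watcher that simply acknowledges each notify, using only the declarations above (oid/oloc setup elided):

    static struct ceph_object_id my_oid;        /* hypothetical, initialized elsewhere */
    static struct ceph_object_locator my_oloc;  /* hypothetical, initialized elsewhere */

    static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
                            u64 notifier_id, void *data, size_t data_len)
    {
            struct ceph_osd_client *osdc = arg;

            /* tell the notifier we saw this event */
            ceph_osdc_notify_ack(osdc, &my_oid, &my_oloc, notify_id, cookie,
                                 NULL, 0);
    }

    static void my_err_cb(void *arg, u64 cookie, int err)
    {
            pr_err("watch cookie %llu error %d\n", cookie, err);
    }

    /* register:  lreq = ceph_osdc_watch(osdc, &my_oid, &my_oloc,
     *                                   my_watch_cb, my_err_cb, osdc);
     * tear down: ceph_osdc_unwatch(osdc, lreq); */
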
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08bc3a96..ddc426b22d81 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,21 +24,29 @@ struct ceph_pg {
24 uint32_t seed; 24 uint32_t seed;
25}; 25};
26 26
27#define CEPH_POOL_FLAG_HASHPSPOOL 1 27int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
28
29#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
30 together */
31#define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */
28 32
29struct ceph_pg_pool_info { 33struct ceph_pg_pool_info {
30 struct rb_node node; 34 struct rb_node node;
31 s64 id; 35 s64 id;
32 u8 type; 36 u8 type; /* CEPH_POOL_TYPE_* */
33 u8 size; 37 u8 size;
38 u8 min_size;
34 u8 crush_ruleset; 39 u8 crush_ruleset;
35 u8 object_hash; 40 u8 object_hash;
41 u32 last_force_request_resend;
36 u32 pg_num, pgp_num; 42 u32 pg_num, pgp_num;
37 int pg_num_mask, pgp_num_mask; 43 int pg_num_mask, pgp_num_mask;
38 s64 read_tier; 44 s64 read_tier;
39 s64 write_tier; /* wins for read+write ops */ 45 s64 write_tier; /* wins for read+write ops */
40 u64 flags; 46 u64 flags; /* CEPH_POOL_FLAG_* */
41 char *name; 47 char *name;
48
49 bool was_full; /* for handle_one_map() */
42}; 50};
43 51
44static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) 52static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
@@ -57,6 +65,22 @@ struct ceph_object_locator {
57 s64 pool; 65 s64 pool;
58}; 66};
59 67
68static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
69{
70 oloc->pool = -1;
71}
72
73static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
74{
75 return oloc->pool == -1;
76}
77
78static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
79 const struct ceph_object_locator *src)
80{
81 dest->pool = src->pool;
82}
83
60/* 84/*
61 * Maximum supported by kernel client object name length 85 * Maximum supported by kernel client object name length
62 * 86 *
@@ -64,11 +88,47 @@ struct ceph_object_locator {
64 */ 88 */
65#define CEPH_MAX_OID_NAME_LEN 100 89#define CEPH_MAX_OID_NAME_LEN 100
66 90
91/*
92 * 51-char inline_name is long enough for all cephfs and all but one
93 * rbd request: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
94 * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all
95 * other rbd requests fit into inline_name.
96 *
97 * Makes ceph_object_id 64 bytes on 64-bit.
98 */
99#define CEPH_OID_INLINE_LEN 52
100
101/*
102 * Both inline and external buffers have space for a NUL-terminator,
103 * which is carried around. It's not required though - RADOS object
104 * names don't have to be NUL-terminated and may contain NULs.
105 */
67struct ceph_object_id { 106struct ceph_object_id {
68 char name[CEPH_MAX_OID_NAME_LEN]; 107 char *name;
108 char inline_name[CEPH_OID_INLINE_LEN];
69 int name_len; 109 int name_len;
70}; 110};
71 111
112static inline void ceph_oid_init(struct ceph_object_id *oid)
113{
114 oid->name = oid->inline_name;
115 oid->name_len = 0;
116}
117
118static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
119{
120 return oid->name == oid->inline_name && !oid->name_len;
121}
122
123void ceph_oid_copy(struct ceph_object_id *dest,
124 const struct ceph_object_id *src);
125__printf(2, 3)
126void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
127__printf(3, 4)
128int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
129 const char *fmt, ...);
130void ceph_oid_destroy(struct ceph_object_id *oid);
131
72struct ceph_pg_mapping { 132struct ceph_pg_mapping {
73 struct rb_node node; 133 struct rb_node node;
74 struct ceph_pg pgid; 134 struct ceph_pg pgid;
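
ceph_object_id now defaults to its 52-byte inline buffer and falls back to an allocation only for oversized names. The intended lifecycle, as a sketch (set_image_oid and image_name are hypothetical):

    static int set_image_oid(struct ceph_object_id *oid, const char *image_name)
    {
            ceph_oid_init(oid);     /* name points at inline_name, len 0 */
            /* allocates an external buffer only if the formatted name
             * does not fit in CEPH_OID_INLINE_LEN */
            return ceph_oid_aprintf(oid, GFP_KERNEL, "rbd_id.%s", image_name);
    }

    /* ... pair every successful init/aprintf with ceph_oid_destroy(oid),
     * which frees the buffer iff one was allocated ... */

Keeping the common case inline is what lets struct ceph_osd_request_target (osd_client.h above) embed two oids by value without two fixed 100-byte name arrays.
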
@@ -87,7 +147,6 @@ struct ceph_pg_mapping {
87struct ceph_osdmap { 147struct ceph_osdmap {
88 struct ceph_fsid fsid; 148 struct ceph_fsid fsid;
89 u32 epoch; 149 u32 epoch;
90 u32 mkfs_epoch;
91 struct ceph_timespec created, modified; 150 struct ceph_timespec created, modified;
92 151
93 u32 flags; /* CEPH_OSDMAP_* */ 152 u32 flags; /* CEPH_OSDMAP_* */
@@ -113,43 +172,19 @@ struct ceph_osdmap {
113 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; 172 int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
114}; 173};
115 174
116static inline void ceph_oid_set_name(struct ceph_object_id *oid, 175static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
117 const char *name)
118{
119 int len;
120
121 len = strlen(name);
122 if (len > sizeof(oid->name)) {
123 WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n",
124 name, len, sizeof(oid->name));
125 len = sizeof(oid->name);
126 }
127
128 memcpy(oid->name, name, len);
129 oid->name_len = len;
130}
131
132static inline void ceph_oid_copy(struct ceph_object_id *dest,
133 struct ceph_object_id *src)
134{
135 BUG_ON(src->name_len > sizeof(dest->name));
136 memcpy(dest->name, src->name, src->name_len);
137 dest->name_len = src->name_len;
138}
139
140static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd)
141{ 176{
142 return osd >= 0 && osd < map->max_osd && 177 return osd >= 0 && osd < map->max_osd &&
143 (map->osd_state[osd] & CEPH_OSD_EXISTS); 178 (map->osd_state[osd] & CEPH_OSD_EXISTS);
144} 179}
145 180
146static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) 181static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
147{ 182{
148 return ceph_osd_exists(map, osd) && 183 return ceph_osd_exists(map, osd) &&
149 (map->osd_state[osd] & CEPH_OSD_UP); 184 (map->osd_state[osd] & CEPH_OSD_UP);
150} 185}
151 186
152static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) 187static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
153{ 188{
154 return !ceph_osd_is_up(map, osd); 189 return !ceph_osd_is_up(map, osd);
155} 190}
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
192 return 0; 227 return 0;
193} 228}
194 229
230struct ceph_osdmap *ceph_osdmap_alloc(void);
195extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); 231extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
196extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 232struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
197 struct ceph_osdmap *map, 233 struct ceph_osdmap *map);
198 struct ceph_messenger *msgr);
199extern void ceph_osdmap_destroy(struct ceph_osdmap *map); 234extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
200 235
236struct ceph_osds {
237 int osds[CEPH_PG_MAX_SIZE];
238 int size;
239 int primary; /* id, NOT index */
240};
241
242static inline void ceph_osds_init(struct ceph_osds *set)
243{
244 set->size = 0;
245 set->primary = -1;
246}
247
248void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
249
250bool ceph_is_new_interval(const struct ceph_osds *old_acting,
251 const struct ceph_osds *new_acting,
252 const struct ceph_osds *old_up,
253 const struct ceph_osds *new_up,
254 int old_size,
255 int new_size,
256 int old_min_size,
257 int new_min_size,
258 u32 old_pg_num,
259 u32 new_pg_num,
260 bool old_sort_bitwise,
261 bool new_sort_bitwise,
262 const struct ceph_pg *pgid);
263bool ceph_osds_changed(const struct ceph_osds *old_acting,
264 const struct ceph_osds *new_acting,
265 bool any_change);
266
201/* calculate mapping of a file extent to an object */ 267/* calculate mapping of a file extent to an object */
202extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 268extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
203 u64 off, u64 len, 269 u64 off, u64 len,
204 u64 *bno, u64 *oxoff, u64 *oxlen); 270 u64 *bno, u64 *oxoff, u64 *oxlen);
205 271
206/* calculate mapping of object to a placement group */ 272int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
207extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 273 struct ceph_object_id *oid,
208 struct ceph_object_locator *oloc, 274 struct ceph_object_locator *oloc,
209 struct ceph_object_id *oid, 275 struct ceph_pg *raw_pgid);
210 struct ceph_pg *pg_out); 276
211 277void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
212extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, 278 const struct ceph_pg *raw_pgid,
213 struct ceph_pg pgid, 279 struct ceph_osds *up,
214 int *osds, int *primary); 280 struct ceph_osds *acting);
215extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 281int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
216 struct ceph_pg pgid); 282 const struct ceph_pg *raw_pgid);
217 283
218extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 284extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
219 u64 id); 285 u64 id);
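
The replacement calculation API splits the old one-shot ceph_oloc_oid_to_pg()/ceph_calc_pg_acting() path into explicit steps. A sketch of the intended flow, assuming osdmap, oid and oloc are already set up:

	struct ceph_pg raw_pgid;
	struct ceph_osds up, acting;
	int primary, ret;

	ret = ceph_object_locator_to_pg(osdmap, oid, oloc, &raw_pgid);
	if (ret)
		return ret;	/* e.g. the pool doesn't exist */

	ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
	primary = ceph_pg_to_acting_primary(osdmap, &raw_pgid);
	/* primary is an osd id, presumably -1 when there is none,
	   mirroring ceph_osds_init() above */
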
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 2f822dca1046..5c0da61cb763 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -114,8 +114,8 @@ struct ceph_object_layout {
114 * compound epoch+version, used by storage layer to serialize mutations 114 * compound epoch+version, used by storage layer to serialize mutations
115 */ 115 */
116struct ceph_eversion { 116struct ceph_eversion {
117 __le32 epoch;
118 __le64 version; 117 __le64 version;
118 __le32 epoch;
119} __attribute__ ((packed)); 119} __attribute__ ((packed));
120 120
121/* 121/*
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s);
153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ 153#define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */
154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ 154#define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */
155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ 155#define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */
156#define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */
157#define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */
158#define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */
159#define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */
160#define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */
156 161
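These map-wide flags gate client behavior the same way the existing ones do; SORTBITWISE, for instance, is what feeds the old/new_sort_bitwise arguments of ceph_is_new_interval() in osdmap.h. A trivial sketch:

	bool sort_bitwise = osdmap->flags & CEPH_OSDMAP_SORTBITWISE;
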
157/* 162/*
158 * The error code to return when an OSD can't handle a write 163 * The error code to return when an OSD can't handle a write
@@ -389,6 +394,13 @@ enum {
389 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ 394 CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */
390 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ 395 CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */
391 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ 396 CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */
397 CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */
398 CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if
399 pool uses pool snaps */
400 CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */
401 CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */
402 CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */
403 CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */
392}; 404};
393 405
394enum { 406enum {
@@ -415,7 +427,17 @@ enum {
415 CEPH_OSD_CMPXATTR_MODE_U64 = 2 427 CEPH_OSD_CMPXATTR_MODE_U64 = 2
416}; 428};
417 429
418#define RADOS_NOTIFY_VER 1 430enum {
431 CEPH_OSD_WATCH_OP_UNWATCH = 0,
432 CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
433 /* note: use only ODD ids to prevent pre-giant code from
434 interpreting the op as UNWATCH */
435 CEPH_OSD_WATCH_OP_WATCH = 3,
436 CEPH_OSD_WATCH_OP_RECONNECT = 5,
437 CEPH_OSD_WATCH_OP_PING = 7,
438};
439
440const char *ceph_osd_watch_op_name(int o);
419 441
420/* 442/*
421 * an individual object operation. each may be accompanied by some data 443 * an individual object operation. each may be accompanied by some data
@@ -450,10 +472,14 @@ struct ceph_osd_op {
450 } __attribute__ ((packed)) snap; 472 } __attribute__ ((packed)) snap;
451 struct { 473 struct {
452 __le64 cookie; 474 __le64 cookie;
453 __le64 ver; 475 __le64 ver; /* no longer used */
454 __u8 flag; /* 0 = unwatch, 1 = watch */ 476 __u8 op; /* CEPH_OSD_WATCH_OP_* */
477 __le32 gen; /* registration generation */
455 } __attribute__ ((packed)) watch; 478 } __attribute__ ((packed)) watch;
456 struct { 479 struct {
480 __le64 cookie;
481 } __attribute__ ((packed)) notify;
482 struct {
457 __le64 offset, length; 483 __le64 offset, length;
458 __le64 src_offset; 484 __le64 src_offset;
459 } __attribute__ ((packed)) clonerange; 485 } __attribute__ ((packed)) clonerange;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6f7cf9..55d2bfee16d7 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client);
651/* 651/*
652 * true if we have the mon map (and have thus joined the cluster) 652 * true if we have the mon map (and have thus joined the cluster)
653 */ 653 */
654static int have_mon_and_osd_map(struct ceph_client *client) 654static bool have_mon_and_osd_map(struct ceph_client *client)
655{ 655{
656 return client->monc.monmap && client->monc.monmap->epoch && 656 return client->monc.monmap && client->monc.monmap->epoch &&
657 client->osdc.osdmap && client->osdc.osdmap->epoch; 657 client->osdc.osdmap && client->osdc.osdmap->epoch;
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 139a9cb19b0c..3773a4fa11e3 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
27 } 27 }
28} 28}
29 29
30const char *ceph_osd_watch_op_name(int o)
31{
32 switch (o) {
33 case CEPH_OSD_WATCH_OP_UNWATCH:
34 return "unwatch";
35 case CEPH_OSD_WATCH_OP_WATCH:
36 return "watch";
37 case CEPH_OSD_WATCH_OP_RECONNECT:
38 return "reconnect";
39 case CEPH_OSD_WATCH_OP_PING:
40 return "ping";
41 default:
42 return "???";
43 }
44}
45
30const char *ceph_osd_state_name(int s) 46const char *ceph_osd_state_name(int s)
31{ 47{
32 switch (s) { 48 switch (s) {
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index b902fbc7863e..e77b04ca7802 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p)
54{ 54{
55 int i; 55 int i;
56 struct ceph_client *client = s->private; 56 struct ceph_client *client = s->private;
57 struct ceph_osdmap *map = client->osdc.osdmap; 57 struct ceph_osd_client *osdc = &client->osdc;
58 struct ceph_osdmap *map = osdc->osdmap;
58 struct rb_node *n; 59 struct rb_node *n;
59 60
60 if (map == NULL) 61 if (map == NULL)
61 return 0; 62 return 0;
62 63
63 seq_printf(s, "epoch %d\n", map->epoch); 64 down_read(&osdc->lock);
64 seq_printf(s, "flags%s%s\n", 65 seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags);
65 (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
66 (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
67 66
68 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { 67 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
69 struct ceph_pg_pool_info *pool = 68 struct ceph_pg_pool_info *pi =
70 rb_entry(n, struct ceph_pg_pool_info, node); 69 rb_entry(n, struct ceph_pg_pool_info, node);
71 70
72 seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", 71 seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n",
73 pool->id, pool->pg_num, pool->pg_num_mask, 72 pi->id, pi->name, pi->type, pi->size, pi->min_size,
74 pool->read_tier, pool->write_tier); 73 pi->pg_num, pi->pg_num_mask, pi->flags,
74 pi->last_force_request_resend, pi->read_tier,
75 pi->write_tier);
75 } 76 }
76 for (i = 0; i < map->max_osd; i++) { 77 for (i = 0; i < map->max_osd; i++) {
77 struct ceph_entity_addr *addr = &map->osd_addr[i]; 78 struct ceph_entity_addr *addr = &map->osd_addr[i];
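
With the new format strings, the osdmap dump in debugfs might begin like this (all values hypothetical, shaped purely by the seq_printf() calls above):

	epoch 13 flags 0x8000
	pool 1 'rbd' type 1 size 3 min_size 2 pg_num 64 pg_num_mask 63 flags 0x1 lfor 0 read_tier -1 write_tier -1
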
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p)
103 pg->pgid.seed, pg->primary_temp.osd); 104 pg->pgid.seed, pg->primary_temp.osd);
104 } 105 }
105 106
107 up_read(&osdc->lock);
106 return 0; 108 return 0;
107} 109}
108 110
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p)
126 CEPH_SUBSCRIBE_ONETIME ? "" : "+")); 128 CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
127 seq_putc(s, '\n'); 129 seq_putc(s, '\n');
128 } 130 }
131 seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id);
129 132
130 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { 133 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
131 __u16 op; 134 __u16 op;
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p)
143 return 0; 146 return 0;
144} 147}
145 148
146static int osdc_show(struct seq_file *s, void *pp) 149static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
147{ 150{
148 struct ceph_client *client = s->private; 151 int i;
149 struct ceph_osd_client *osdc = &client->osdc;
150 struct rb_node *p;
151 152
152 mutex_lock(&osdc->request_mutex); 153 seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
153 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 154 for (i = 0; i < t->up.size; i++)
154 struct ceph_osd_request *req; 155 seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
155 unsigned int i; 156 seq_printf(s, "]/%d\t[", t->up.primary);
156 int opcode; 157 for (i = 0; i < t->acting.size; i++)
158 seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
159 seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
160 t->target_oid.name_len, t->target_oid.name, t->flags);
161 if (t->paused)
162 seq_puts(s, "\tP");
163}
157 164
158 req = rb_entry(p, struct ceph_osd_request, r_node); 165static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
166{
167 int i;
159 168
160 seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, 169 seq_printf(s, "%llu\t", req->r_tid);
161 req->r_osd ? req->r_osd->o_osd : -1, 170 dump_target(s, &req->r_t);
162 req->r_pgid.pool, req->r_pgid.seed);
163 171
164 seq_printf(s, "%.*s", req->r_base_oid.name_len, 172 seq_printf(s, "\t%d\t%u'%llu", req->r_attempts,
165 req->r_base_oid.name); 173 le32_to_cpu(req->r_replay_version.epoch),
174 le64_to_cpu(req->r_replay_version.version));
166 175
167 if (req->r_reassert_version.epoch) 176 for (i = 0; i < req->r_num_ops; i++) {
168 seq_printf(s, "\t%u'%llu", 177 struct ceph_osd_req_op *op = &req->r_ops[i];
169 (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), 178
170 le64_to_cpu(req->r_reassert_version.version)); 179 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","),
171 else 180 ceph_osd_op_name(op->op));
172 seq_printf(s, "\t"); 181 if (op->op == CEPH_OSD_OP_WATCH)
182 seq_printf(s, "-%s",
183 ceph_osd_watch_op_name(op->watch.op));
184 }
185
186 seq_putc(s, '\n');
187}
188
189static void dump_requests(struct seq_file *s, struct ceph_osd *osd)
190{
191 struct rb_node *n;
192
193 mutex_lock(&osd->lock);
194 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
195 struct ceph_osd_request *req =
196 rb_entry(n, struct ceph_osd_request, r_node);
197
198 dump_request(s, req);
199 }
200
201 mutex_unlock(&osd->lock);
202}
173 203
174 for (i = 0; i < req->r_num_ops; i++) { 204static void dump_linger_request(struct seq_file *s,
175 opcode = req->r_ops[i].op; 205 struct ceph_osd_linger_request *lreq)
176 seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), 206{
177 ceph_osd_op_name(opcode)); 207 seq_printf(s, "%llu\t", lreq->linger_id);
178 } 208 dump_target(s, &lreq->t);
209
210 seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen,
211 lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "",
212 lreq->last_error);
213}
214
215static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
216{
217 struct rb_node *n;
218
219 mutex_lock(&osd->lock);
220 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
221 struct ceph_osd_linger_request *lreq =
222 rb_entry(n, struct ceph_osd_linger_request, node);
223
224 dump_linger_request(s, lreq);
225 }
226
227 mutex_unlock(&osd->lock);
228}
179 229
180 seq_printf(s, "\n"); 230static int osdc_show(struct seq_file *s, void *pp)
231{
232 struct ceph_client *client = s->private;
233 struct ceph_osd_client *osdc = &client->osdc;
234 struct rb_node *n;
235
236 down_read(&osdc->lock);
237 seq_printf(s, "REQUESTS %d homeless %d\n",
238 atomic_read(&osdc->num_requests),
239 atomic_read(&osdc->num_homeless));
240 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
241 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
242
243 dump_requests(s, osd);
181 } 244 }
182 mutex_unlock(&osdc->request_mutex); 245 dump_requests(s, &osdc->homeless_osd);
246
247 seq_puts(s, "LINGER REQUESTS\n");
248 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
249 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
250
251 dump_linger_requests(s, osd);
252 }
253 dump_linger_requests(s, &osdc->homeless_osd);
254
255 up_read(&osdc->lock);
183 return 0; 256 return 0;
184} 257}
185 258
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index cf638c009cfa..37c38a7fb5c5 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc)
260 BUG_ON(num < 1); /* monmap sub is always there */ 260 BUG_ON(num < 1); /* monmap sub is always there */
261 ceph_encode_32(&p, num); 261 ceph_encode_32(&p, num);
262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 262 for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
263 const char *s = ceph_sub_str[i]; 263 char buf[32];
264 int len;
264 265
265 if (!monc->subs[i].want) 266 if (!monc->subs[i].want)
266 continue; 267 continue;
267 268
268 dout("%s %s start %llu flags 0x%x\n", __func__, s, 269 len = sprintf(buf, "%s", ceph_sub_str[i]);
270 if (i == CEPH_SUB_MDSMAP &&
271 monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE)
272 len += sprintf(buf + len, ".%d", monc->fs_cluster_id);
273
274 dout("%s %s start %llu flags 0x%x\n", __func__, buf,
269 le64_to_cpu(monc->subs[i].item.start), 275 le64_to_cpu(monc->subs[i].item.start),
270 monc->subs[i].item.flags); 276 monc->subs[i].item.flags);
271 ceph_encode_string(&p, end, s, strlen(s)); 277 ceph_encode_string(&p, end, buf, len);
272 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); 278 memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
273 p += sizeof(monc->subs[i].item); 279 p += sizeof(monc->subs[i].item);
274 } 280 }
275 281
276 BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); 282 BUG_ON(p > end);
277 msg->front.iov_len = p - msg->front.iov_base; 283 msg->front.iov_len = p - msg->front.iov_base;
278 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 284 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
279 ceph_msg_revoke(msg); 285 ceph_msg_revoke(msg);
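
The net effect is that the mdsmap subscription name carries the filesystem cluster id when one is set, e.g. (hypothetical id):

	fs_cluster_id == 1                        ->  subscribes to "mdsmap.1"
	fs_cluster_id == CEPH_FS_CLUSTER_ID_NONE  ->  subscribes to "mdsmap"
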
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
376} 382}
377EXPORT_SYMBOL(ceph_monc_got_map); 383EXPORT_SYMBOL(ceph_monc_got_map);
378 384
379/* 385void ceph_monc_renew_subs(struct ceph_mon_client *monc)
380 * Register interest in the next osdmap
381 */
382void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
383{ 386{
384 dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
385 mutex_lock(&monc->mutex); 387 mutex_lock(&monc->mutex);
386 if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 388 __send_subscribe(monc);
387 monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
388 __send_subscribe(monc);
389 mutex_unlock(&monc->mutex); 389 mutex_unlock(&monc->mutex);
390} 390}
391EXPORT_SYMBOL(ceph_monc_request_next_osdmap); 391EXPORT_SYMBOL(ceph_monc_renew_subs);
392 392
393/* 393/*
394 * Wait for an osdmap with a given epoch. 394 * Wait for an osdmap with a given epoch.
@@ -478,51 +478,17 @@ out:
478/* 478/*
479 * generic requests (currently statfs, mon_get_version) 479 * generic requests (currently statfs, mon_get_version)
480 */ 480 */
481static struct ceph_mon_generic_request *__lookup_generic_req( 481DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node)
482 struct ceph_mon_client *monc, u64 tid)
483{
484 struct ceph_mon_generic_request *req;
485 struct rb_node *n = monc->generic_request_tree.rb_node;
486
487 while (n) {
488 req = rb_entry(n, struct ceph_mon_generic_request, node);
489 if (tid < req->tid)
490 n = n->rb_left;
491 else if (tid > req->tid)
492 n = n->rb_right;
493 else
494 return req;
495 }
496 return NULL;
497}
498
499static void __insert_generic_request(struct ceph_mon_client *monc,
500 struct ceph_mon_generic_request *new)
501{
502 struct rb_node **p = &monc->generic_request_tree.rb_node;
503 struct rb_node *parent = NULL;
504 struct ceph_mon_generic_request *req = NULL;
505
506 while (*p) {
507 parent = *p;
508 req = rb_entry(parent, struct ceph_mon_generic_request, node);
509 if (new->tid < req->tid)
510 p = &(*p)->rb_left;
511 else if (new->tid > req->tid)
512 p = &(*p)->rb_right;
513 else
514 BUG();
515 }
516
517 rb_link_node(&new->node, parent, p);
518 rb_insert_color(&new->node, &monc->generic_request_tree);
519}
520 482
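DEFINE_RB_FUNCS replaces the hand-rolled tree walkers above. Judging by the call sites in this patch, the macro generates static helpers keyed on the named field, roughly:

	static void insert_generic_request(struct rb_root *root,
					   struct ceph_mon_generic_request *req);
	static void erase_generic_request(struct rb_root *root,
					  struct ceph_mon_generic_request *req);
	static struct ceph_mon_generic_request *
	lookup_generic_request(struct rb_root *root, u64 tid);
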
521static void release_generic_request(struct kref *kref) 483static void release_generic_request(struct kref *kref)
522{ 484{
523 struct ceph_mon_generic_request *req = 485 struct ceph_mon_generic_request *req =
524 container_of(kref, struct ceph_mon_generic_request, kref); 486 container_of(kref, struct ceph_mon_generic_request, kref);
525 487
488 dout("%s greq %p request %p reply %p\n", __func__, req, req->request,
489 req->reply);
490 WARN_ON(!RB_EMPTY_NODE(&req->node));
491
526 if (req->reply) 492 if (req->reply)
527 ceph_msg_put(req->reply); 493 ceph_msg_put(req->reply);
528 if (req->request) 494 if (req->request)
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref)
533 499
534static void put_generic_request(struct ceph_mon_generic_request *req) 500static void put_generic_request(struct ceph_mon_generic_request *req)
535{ 501{
536 kref_put(&req->kref, release_generic_request); 502 if (req)
503 kref_put(&req->kref, release_generic_request);
537} 504}
538 505
539static void get_generic_request(struct ceph_mon_generic_request *req) 506static void get_generic_request(struct ceph_mon_generic_request *req)
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req)
541 kref_get(&req->kref); 508 kref_get(&req->kref);
542} 509}
543 510
511static struct ceph_mon_generic_request *
512alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp)
513{
514 struct ceph_mon_generic_request *req;
515
516 req = kzalloc(sizeof(*req), gfp);
517 if (!req)
518 return NULL;
519
520 req->monc = monc;
521 kref_init(&req->kref);
522 RB_CLEAR_NODE(&req->node);
523 init_completion(&req->completion);
524
525 dout("%s greq %p\n", __func__, req);
526 return req;
527}
528
529static void register_generic_request(struct ceph_mon_generic_request *req)
530{
531 struct ceph_mon_client *monc = req->monc;
532
533 WARN_ON(req->tid);
534
535 get_generic_request(req);
536 req->tid = ++monc->last_tid;
537 insert_generic_request(&monc->generic_request_tree, req);
538}
539
540static void send_generic_request(struct ceph_mon_client *monc,
541 struct ceph_mon_generic_request *req)
542{
543 WARN_ON(!req->tid);
544
545 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
546 req->request->hdr.tid = cpu_to_le64(req->tid);
547 ceph_con_send(&monc->con, ceph_msg_get(req->request));
548}
549
550static void __finish_generic_request(struct ceph_mon_generic_request *req)
551{
552 struct ceph_mon_client *monc = req->monc;
553
554 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
555 erase_generic_request(&monc->generic_request_tree, req);
556
557 ceph_msg_revoke(req->request);
558 ceph_msg_revoke_incoming(req->reply);
559}
560
561static void finish_generic_request(struct ceph_mon_generic_request *req)
562{
563 __finish_generic_request(req);
564 put_generic_request(req);
565}
566
567static void complete_generic_request(struct ceph_mon_generic_request *req)
568{
569 if (req->complete_cb)
570 req->complete_cb(req);
571 else
572 complete_all(&req->completion);
573 put_generic_request(req);
574}
575
576void cancel_generic_request(struct ceph_mon_generic_request *req)
577{
578 struct ceph_mon_client *monc = req->monc;
579 struct ceph_mon_generic_request *lookup_req;
580
581 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
582
583 mutex_lock(&monc->mutex);
584 lookup_req = lookup_generic_request(&monc->generic_request_tree,
585 req->tid);
586 if (lookup_req) {
587 WARN_ON(lookup_req != req);
588 finish_generic_request(req);
589 }
590
591 mutex_unlock(&monc->mutex);
592}
593
594static int wait_generic_request(struct ceph_mon_generic_request *req)
595{
596 int ret;
597
598 dout("%s greq %p tid %llu\n", __func__, req, req->tid);
599 ret = wait_for_completion_interruptible(&req->completion);
600 if (ret)
601 cancel_generic_request(req);
602 else
603 ret = req->result; /* completed */
604
605 return ret;
606}
607
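These helpers compose into one canonical sequence, which ceph_monc_do_statfs() below follows. In outline (message allocation and payload encoding elided):

	req = alloc_generic_request(monc, GFP_NOFS);
	/* ... allocate req->request and req->reply messages ... */

	mutex_lock(&monc->mutex);
	register_generic_request(req);	/* assigns tid, inserts into the tree */
	/* ... fill out the request payload ... */
	send_generic_request(monc, req);
	mutex_unlock(&monc->mutex);

	ret = wait_generic_request(req);	/* cancels itself if interrupted */
	put_generic_request(req);
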
544static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 608static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
545 struct ceph_msg_header *hdr, 609 struct ceph_msg_header *hdr,
546 int *skip) 610 int *skip)
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
551 struct ceph_msg *m; 615 struct ceph_msg *m;
552 616
553 mutex_lock(&monc->mutex); 617 mutex_lock(&monc->mutex);
554 req = __lookup_generic_req(monc, tid); 618 req = lookup_generic_request(&monc->generic_request_tree, tid);
555 if (!req) { 619 if (!req) {
556 dout("get_generic_reply %lld dne\n", tid); 620 dout("get_generic_reply %lld dne\n", tid);
557 *skip = 1; 621 *skip = 1;
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
570 return m; 634 return m;
571} 635}
572 636
573static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
574 struct ceph_mon_generic_request *req)
575{
576 int err;
577
578 /* register request */
579 req->tid = tid != 0 ? tid : ++monc->last_tid;
580 req->request->hdr.tid = cpu_to_le64(req->tid);
581 __insert_generic_request(monc, req);
582 monc->num_generic_requests++;
583 ceph_con_send(&monc->con, ceph_msg_get(req->request));
584 mutex_unlock(&monc->mutex);
585
586 err = wait_for_completion_interruptible(&req->completion);
587
588 mutex_lock(&monc->mutex);
589 rb_erase(&req->node, &monc->generic_request_tree);
590 monc->num_generic_requests--;
591
592 if (!err)
593 err = req->result;
594 return err;
595}
596
597static int do_generic_request(struct ceph_mon_client *monc,
598 struct ceph_mon_generic_request *req)
599{
600 int err;
601
602 mutex_lock(&monc->mutex);
603 err = __do_generic_request(monc, 0, req);
604 mutex_unlock(&monc->mutex);
605
606 return err;
607}
608
609/* 637/*
610 * statfs 638 * statfs
611 */ 639 */
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
616 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 644 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
617 u64 tid = le64_to_cpu(msg->hdr.tid); 645 u64 tid = le64_to_cpu(msg->hdr.tid);
618 646
647 dout("%s msg %p tid %llu\n", __func__, msg, tid);
648
619 if (msg->front.iov_len != sizeof(*reply)) 649 if (msg->front.iov_len != sizeof(*reply))
620 goto bad; 650 goto bad;
621 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
622 651
623 mutex_lock(&monc->mutex); 652 mutex_lock(&monc->mutex);
624 req = __lookup_generic_req(monc, tid); 653 req = lookup_generic_request(&monc->generic_request_tree, tid);
625 if (req) { 654 if (!req) {
626 *(struct ceph_statfs *)req->buf = reply->st; 655 mutex_unlock(&monc->mutex);
627 req->result = 0; 656 return;
628 get_generic_request(req);
629 } 657 }
658
659 req->result = 0;
660 *req->u.st = reply->st; /* struct */
661 __finish_generic_request(req);
630 mutex_unlock(&monc->mutex); 662 mutex_unlock(&monc->mutex);
631 if (req) { 663
632 complete_all(&req->completion); 664 complete_generic_request(req);
633 put_generic_request(req);
634 }
635 return; 665 return;
636 666
637bad: 667bad:
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
646{ 676{
647 struct ceph_mon_generic_request *req; 677 struct ceph_mon_generic_request *req;
648 struct ceph_mon_statfs *h; 678 struct ceph_mon_statfs *h;
649 int err; 679 int ret = -ENOMEM;
650 680
651 req = kzalloc(sizeof(*req), GFP_NOFS); 681 req = alloc_generic_request(monc, GFP_NOFS);
652 if (!req) 682 if (!req)
653 return -ENOMEM; 683 goto out;
654
655 kref_init(&req->kref);
656 req->buf = buf;
657 init_completion(&req->completion);
658 684
659 err = -ENOMEM;
660 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 685 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
661 true); 686 true);
662 if (!req->request) 687 if (!req->request)
663 goto out; 688 goto out;
664 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 689
665 true); 690 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true);
666 if (!req->reply) 691 if (!req->reply)
667 goto out; 692 goto out;
668 693
694 req->u.st = buf;
695
696 mutex_lock(&monc->mutex);
697 register_generic_request(req);
669 /* fill out request */ 698 /* fill out request */
670 h = req->request->front.iov_base; 699 h = req->request->front.iov_base;
671 h->monhdr.have_version = 0; 700 h->monhdr.have_version = 0;
672 h->monhdr.session_mon = cpu_to_le16(-1); 701 h->monhdr.session_mon = cpu_to_le16(-1);
673 h->monhdr.session_mon_tid = 0; 702 h->monhdr.session_mon_tid = 0;
674 h->fsid = monc->monmap->fsid; 703 h->fsid = monc->monmap->fsid;
704 send_generic_request(monc, req);
705 mutex_unlock(&monc->mutex);
675 706
676 err = do_generic_request(monc, req); 707 ret = wait_generic_request(req);
677
678out: 708out:
679 put_generic_request(req); 709 put_generic_request(req);
680 return err; 710 return ret;
681} 711}
682EXPORT_SYMBOL(ceph_monc_do_statfs); 712EXPORT_SYMBOL(ceph_monc_do_statfs);
683 713
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
690 void *end = p + msg->front_alloc_len; 720 void *end = p + msg->front_alloc_len;
691 u64 handle; 721 u64 handle;
692 722
693 dout("%s %p tid %llu\n", __func__, msg, tid); 723 dout("%s msg %p tid %llu\n", __func__, msg, tid);
694 724
695 ceph_decode_need(&p, end, 2*sizeof(u64), bad); 725 ceph_decode_need(&p, end, 2*sizeof(u64), bad);
696 handle = ceph_decode_64(&p); 726 handle = ceph_decode_64(&p);
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
698 goto bad; 728 goto bad;
699 729
700 mutex_lock(&monc->mutex); 730 mutex_lock(&monc->mutex);
701 req = __lookup_generic_req(monc, handle); 731 req = lookup_generic_request(&monc->generic_request_tree, handle);
702 if (req) { 732 if (!req) {
703 *(u64 *)req->buf = ceph_decode_64(&p); 733 mutex_unlock(&monc->mutex);
704 req->result = 0; 734 return;
705 get_generic_request(req);
706 } 735 }
736
737 req->result = 0;
738 req->u.newest = ceph_decode_64(&p);
739 __finish_generic_request(req);
707 mutex_unlock(&monc->mutex); 740 mutex_unlock(&monc->mutex);
708 if (req) {
709 complete_all(&req->completion);
710 put_generic_request(req);
711 }
712 741
742 complete_generic_request(req);
713 return; 743 return;
744
714bad: 745bad:
715 pr_err("corrupt mon_get_version reply, tid %llu\n", tid); 746 pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
716 ceph_msg_dump(msg); 747 ceph_msg_dump(msg);
717} 748}
718 749
719/* 750static struct ceph_mon_generic_request *
720 * Send MMonGetVersion and wait for the reply. 751__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
721 * 752 ceph_monc_callback_t cb, u64 private_data)
722 * @what: one of "mdsmap", "osdmap" or "monmap"
723 */
724int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
725 u64 *newest)
726{ 753{
727 struct ceph_mon_generic_request *req; 754 struct ceph_mon_generic_request *req;
728 void *p, *end;
729 u64 tid;
730 int err;
731 755
732 req = kzalloc(sizeof(*req), GFP_NOFS); 756 req = alloc_generic_request(monc, GFP_NOIO);
733 if (!req) 757 if (!req)
734 return -ENOMEM; 758 goto err_put_req;
735
736 kref_init(&req->kref);
737 req->buf = newest;
738 init_completion(&req->completion);
739 759
740 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, 760 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
741 sizeof(u64) + sizeof(u32) + strlen(what), 761 sizeof(u64) + sizeof(u32) + strlen(what),
742 GFP_NOFS, true); 762 GFP_NOIO, true);
743 if (!req->request) { 763 if (!req->request)
744 err = -ENOMEM; 764 goto err_put_req;
745 goto out;
746 }
747 765
748 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, 766 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO,
749 GFP_NOFS, true); 767 true);
750 if (!req->reply) { 768 if (!req->reply)
751 err = -ENOMEM; 769 goto err_put_req;
752 goto out;
753 }
754 770
755 p = req->request->front.iov_base; 771 req->complete_cb = cb;
756 end = p + req->request->front_alloc_len; 772 req->private_data = private_data;
757 773
758 /* fill out request */
759 mutex_lock(&monc->mutex); 774 mutex_lock(&monc->mutex);
760 tid = ++monc->last_tid; 775 register_generic_request(req);
761 ceph_encode_64(&p, tid); /* handle */ 776 {
762 ceph_encode_string(&p, end, what, strlen(what)); 777 void *p = req->request->front.iov_base;
778 void *const end = p + req->request->front_alloc_len;
779
780 ceph_encode_64(&p, req->tid); /* handle */
781 ceph_encode_string(&p, end, what, strlen(what));
782 WARN_ON(p != end);
783 }
784 send_generic_request(monc, req);
785 mutex_unlock(&monc->mutex);
763 786
764 err = __do_generic_request(monc, tid, req); 787 return req;
765 788
766 mutex_unlock(&monc->mutex); 789err_put_req:
767out:
768 put_generic_request(req); 790 put_generic_request(req);
769 return err; 791 return ERR_PTR(-ENOMEM);
792}
793
794/*
795 * Send MMonGetVersion and wait for the reply.
796 *
797 * @what: one of "mdsmap", "osdmap" or "monmap"
798 */
799int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
800 u64 *newest)
801{
802 struct ceph_mon_generic_request *req;
803 int ret;
804
805 req = __ceph_monc_get_version(monc, what, NULL, 0);
806 if (IS_ERR(req))
807 return PTR_ERR(req);
808
809 ret = wait_generic_request(req);
810 if (!ret)
811 *newest = req->u.newest;
812
813 put_generic_request(req);
814 return ret;
770} 815}
771EXPORT_SYMBOL(ceph_monc_do_get_version); 816EXPORT_SYMBOL(ceph_monc_get_version);
817
818/*
819 * Send MMonGetVersion,
820 *
821 * @what: one of "mdsmap", "osdmap" or "monmap"
822 */
823int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what,
824 ceph_monc_callback_t cb, u64 private_data)
825{
826 struct ceph_mon_generic_request *req;
827
828 req = __ceph_monc_get_version(monc, what, cb, private_data);
829 if (IS_ERR(req))
830 return PTR_ERR(req);
831
832 put_generic_request(req);
833 return 0;
834}
835EXPORT_SYMBOL(ceph_monc_get_version_async);
772 836
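A sketch of the async variant in use. The callback shape is an assumption based on complete_generic_request() above, which invokes it with the finished request; got_version and the pr_info line are hypothetical:

	static void got_version(struct ceph_mon_generic_request *req)
	{
		if (!req->result)
			pr_info("newest osdmap epoch %llu\n", req->u.newest);
		/* req->private_data carries the caller's cookie */
	}

	ret = ceph_monc_get_version_async(monc, "osdmap", got_version, 0);
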
773/* 837/*
774 * Resend pending generic requests. 838 * Resend pending generic requests.
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
890 if (!monc->m_subscribe_ack) 954 if (!monc->m_subscribe_ack)
891 goto out_auth; 955 goto out_auth;
892 956
893 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 957 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS,
894 true); 958 true);
895 if (!monc->m_subscribe) 959 if (!monc->m_subscribe)
896 goto out_subscribe_ack; 960 goto out_subscribe_ack;
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
914 978
915 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 979 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
916 monc->generic_request_tree = RB_ROOT; 980 monc->generic_request_tree = RB_ROOT;
917 monc->num_generic_requests = 0;
918 monc->last_tid = 0; 981 monc->last_tid = 0;
919 982
983 monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE;
984
920 return 0; 985 return 0;
921 986
922out_auth_reply: 987out_auth_reply:
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
954 1019
955 ceph_auth_destroy(monc->auth); 1020 ceph_auth_destroy(monc->auth);
956 1021
1022 WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree));
1023
957 ceph_msg_put(monc->m_auth); 1024 ceph_msg_put(monc->m_auth);
958 ceph_msg_put(monc->m_auth_reply); 1025 ceph_msg_put(monc->m_auth_reply);
959 ceph_msg_put(monc->m_subscribe); 1026 ceph_msg_put(monc->m_subscribe);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 40a53a70efdf..0160d7d09a1e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -19,25 +19,12 @@
19#include <linux/ceph/auth.h> 19#include <linux/ceph/auth.h>
20#include <linux/ceph/pagelist.h> 20#include <linux/ceph/pagelist.h>
21 21
22#define OSD_OP_FRONT_LEN 4096
23#define OSD_OPREPLY_FRONT_LEN 512 22#define OSD_OPREPLY_FRONT_LEN 512
24 23
25static struct kmem_cache *ceph_osd_request_cache; 24static struct kmem_cache *ceph_osd_request_cache;
26 25
27static const struct ceph_connection_operations osd_con_ops; 26static const struct ceph_connection_operations osd_con_ops;
28 27
29static void __send_queued(struct ceph_osd_client *osdc);
30static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
31static void __register_request(struct ceph_osd_client *osdc,
32 struct ceph_osd_request *req);
33static void __unregister_request(struct ceph_osd_client *osdc,
34 struct ceph_osd_request *req);
35static void __unregister_linger_request(struct ceph_osd_client *osdc,
36 struct ceph_osd_request *req);
37static void __enqueue_request(struct ceph_osd_request *req);
38static void __send_request(struct ceph_osd_client *osdc,
39 struct ceph_osd_request *req);
40
41/* 28/*
42 * Implement client access to distributed object storage cluster. 29 * Implement client access to distributed object storage cluster.
43 * 30 *
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc,
56 * channel with an OSD is reset. 43 * channel with an OSD is reset.
57 */ 44 */
58 45
46static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req);
47static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req);
48static void link_linger(struct ceph_osd *osd,
49 struct ceph_osd_linger_request *lreq);
50static void unlink_linger(struct ceph_osd *osd,
51 struct ceph_osd_linger_request *lreq);
52
53#if 1
54static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
55{
56 bool wrlocked = true;
57
58 if (unlikely(down_read_trylock(sem))) {
59 wrlocked = false;
60 up_read(sem);
61 }
62
63 return wrlocked;
64}
65static inline void verify_osdc_locked(struct ceph_osd_client *osdc)
66{
67 WARN_ON(!rwsem_is_locked(&osdc->lock));
68}
69static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc)
70{
71 WARN_ON(!rwsem_is_wrlocked(&osdc->lock));
72}
73static inline void verify_osd_locked(struct ceph_osd *osd)
74{
75 struct ceph_osd_client *osdc = osd->o_osdc;
76
77 WARN_ON(!(mutex_is_locked(&osd->lock) &&
78 rwsem_is_locked(&osdc->lock)) &&
79 !rwsem_is_wrlocked(&osdc->lock));
80}
81static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq)
82{
83 WARN_ON(!mutex_is_locked(&lreq->lock));
84}
85#else
86static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { }
87static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { }
88static inline void verify_osd_locked(struct ceph_osd *osd) { }
89static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { }
90#endif
91
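The wrlocked test exploits a basic rwsem property: down_read_trylock() cannot succeed while a writer holds the semaphore, so a successful trylock proves the lock was not write-held. A sketch of the checks in use:

	down_write(&osdc->lock);
	verify_osdc_wrlocked(osdc);	/* quiet: trylock fails under a writer */
	up_write(&osdc->lock);

	down_read(&osdc->lock);
	verify_osdc_locked(osdc);	/* read- or write-locked both satisfy this */
	up_read(&osdc->lock);
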
59/* 92/*
60 * calculate the mapping of a file extent onto an object, and fill out the 93 * calculate the mapping of a file extent onto an object, and fill out the
61 * request accordingly. shorten extent as necessary if it crosses an 94 * request accordingly. shorten extent as necessary if it crosses an
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
144} 177}
145EXPORT_SYMBOL(osd_req_op_extent_osd_data); 178EXPORT_SYMBOL(osd_req_op_extent_osd_data);
146 179
147struct ceph_osd_data *
148osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
149 unsigned int which)
150{
151 return osd_req_op_data(osd_req, which, cls, response_data);
152}
153EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
154
155void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, 180void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
156 unsigned int which, struct page **pages, 181 unsigned int which, struct page **pages,
157 u64 length, u32 alignment, 182 u64 length, u32 alignment,
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist(
218 243
219 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 244 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
220 ceph_osd_data_pagelist_init(osd_data, pagelist); 245 ceph_osd_data_pagelist_init(osd_data, pagelist);
246 osd_req->r_ops[which].cls.indata_len += pagelist->length;
247 osd_req->r_ops[which].indata_len += pagelist->length;
221} 248}
222EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); 249EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
223 250
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
230 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 257 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
231 ceph_osd_data_pages_init(osd_data, pages, length, alignment, 258 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
232 pages_from_pool, own_pages); 259 pages_from_pool, own_pages);
260 osd_req->r_ops[which].cls.indata_len += length;
261 osd_req->r_ops[which].indata_len += length;
233} 262}
234EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); 263EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
235 264
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
302 case CEPH_OSD_OP_STAT: 331 case CEPH_OSD_OP_STAT:
303 ceph_osd_data_release(&op->raw_data_in); 332 ceph_osd_data_release(&op->raw_data_in);
304 break; 333 break;
334 case CEPH_OSD_OP_NOTIFY_ACK:
335 ceph_osd_data_release(&op->notify_ack.request_data);
336 break;
337 case CEPH_OSD_OP_NOTIFY:
338 ceph_osd_data_release(&op->notify.request_data);
339 ceph_osd_data_release(&op->notify.response_data);
340 break;
305 default: 341 default:
306 break; 342 break;
307 } 343 }
308} 344}
309 345
310/* 346/*
347 * Assumes @t is zero-initialized.
348 */
349static void target_init(struct ceph_osd_request_target *t)
350{
351 ceph_oid_init(&t->base_oid);
352 ceph_oloc_init(&t->base_oloc);
353 ceph_oid_init(&t->target_oid);
354 ceph_oloc_init(&t->target_oloc);
355
356 ceph_osds_init(&t->acting);
357 ceph_osds_init(&t->up);
358 t->size = -1;
359 t->min_size = -1;
360
361 t->osd = CEPH_HOMELESS_OSD;
362}
363
364static void target_copy(struct ceph_osd_request_target *dest,
365 const struct ceph_osd_request_target *src)
366{
367 ceph_oid_copy(&dest->base_oid, &src->base_oid);
368 ceph_oloc_copy(&dest->base_oloc, &src->base_oloc);
369 ceph_oid_copy(&dest->target_oid, &src->target_oid);
370 ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
371
372 dest->pgid = src->pgid; /* struct */
373 dest->pg_num = src->pg_num;
374 dest->pg_num_mask = src->pg_num_mask;
375 ceph_osds_copy(&dest->acting, &src->acting);
376 ceph_osds_copy(&dest->up, &src->up);
377 dest->size = src->size;
378 dest->min_size = src->min_size;
379 dest->sort_bitwise = src->sort_bitwise;
380
381 dest->flags = src->flags;
382 dest->paused = src->paused;
383
384 dest->osd = src->osd;
385}
386
387static void target_destroy(struct ceph_osd_request_target *t)
388{
389 ceph_oid_destroy(&t->base_oid);
390 ceph_oid_destroy(&t->target_oid);
391}
392
393/*
311 * requests 394 * requests
312 */ 395 */
396static void request_release_checks(struct ceph_osd_request *req)
397{
398 WARN_ON(!RB_EMPTY_NODE(&req->r_node));
399 WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
400 WARN_ON(!list_empty(&req->r_unsafe_item));
401 WARN_ON(req->r_osd);
402}
403
313static void ceph_osdc_release_request(struct kref *kref) 404static void ceph_osdc_release_request(struct kref *kref)
314{ 405{
315 struct ceph_osd_request *req = container_of(kref, 406 struct ceph_osd_request *req = container_of(kref,
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref)
318 409
319 dout("%s %p (r_request %p r_reply %p)\n", __func__, req, 410 dout("%s %p (r_request %p r_reply %p)\n", __func__, req,
320 req->r_request, req->r_reply); 411 req->r_request, req->r_reply);
321 WARN_ON(!RB_EMPTY_NODE(&req->r_node)); 412 request_release_checks(req);
322 WARN_ON(!list_empty(&req->r_req_lru_item));
323 WARN_ON(!list_empty(&req->r_osd_item));
324 WARN_ON(!list_empty(&req->r_linger_item));
325 WARN_ON(!list_empty(&req->r_linger_osd_item));
326 WARN_ON(req->r_osd);
327 413
328 if (req->r_request) 414 if (req->r_request)
329 ceph_msg_put(req->r_request); 415 ceph_msg_put(req->r_request);
330 if (req->r_reply) { 416 if (req->r_reply)
331 ceph_msg_revoke_incoming(req->r_reply);
332 ceph_msg_put(req->r_reply); 417 ceph_msg_put(req->r_reply);
333 }
334 418
335 for (which = 0; which < req->r_num_ops; which++) 419 for (which = 0; which < req->r_num_ops; which++)
336 osd_req_op_data_release(req, which); 420 osd_req_op_data_release(req, which);
337 421
422 target_destroy(&req->r_t);
338 ceph_put_snap_context(req->r_snapc); 423 ceph_put_snap_context(req->r_snapc);
424
339 if (req->r_mempool) 425 if (req->r_mempool)
340 mempool_free(req, req->r_osdc->req_mempool); 426 mempool_free(req, req->r_osdc->req_mempool);
341 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) 427 else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request);
354 440
355void ceph_osdc_put_request(struct ceph_osd_request *req) 441void ceph_osdc_put_request(struct ceph_osd_request *req)
356{ 442{
357 dout("%s %p (was %d)\n", __func__, req, 443 if (req) {
358 atomic_read(&req->r_kref.refcount)); 444 dout("%s %p (was %d)\n", __func__, req,
359 kref_put(&req->r_kref, ceph_osdc_release_request); 445 atomic_read(&req->r_kref.refcount));
446 kref_put(&req->r_kref, ceph_osdc_release_request);
447 }
360} 448}
361EXPORT_SYMBOL(ceph_osdc_put_request); 449EXPORT_SYMBOL(ceph_osdc_put_request);
362 450
451static void request_init(struct ceph_osd_request *req)
452{
453 /* req only, each op is zeroed in _osd_req_op_init() */
454 memset(req, 0, sizeof(*req));
455
456 kref_init(&req->r_kref);
457 init_completion(&req->r_completion);
458 init_completion(&req->r_safe_completion);
459 RB_CLEAR_NODE(&req->r_node);
460 RB_CLEAR_NODE(&req->r_mc_node);
461 INIT_LIST_HEAD(&req->r_unsafe_item);
462
463 target_init(&req->r_t);
464}
465
466/*
467 * This is ugly, but it allows us to reuse linger registration and ping
468 * requests, keeping the structure of the code around send_linger{_ping}()
469 * reasonable. Setting up a min_nr=2 mempool for each linger request
470 * and dealing with copying ops (this blasts req only, watch op remains
471 * intact) isn't any better.
472 */
473static void request_reinit(struct ceph_osd_request *req)
474{
475 struct ceph_osd_client *osdc = req->r_osdc;
476 bool mempool = req->r_mempool;
477 unsigned int num_ops = req->r_num_ops;
478 u64 snapid = req->r_snapid;
479 struct ceph_snap_context *snapc = req->r_snapc;
480 bool linger = req->r_linger;
481 struct ceph_msg *request_msg = req->r_request;
482 struct ceph_msg *reply_msg = req->r_reply;
483
484 dout("%s req %p\n", __func__, req);
485 WARN_ON(atomic_read(&req->r_kref.refcount) != 1);
486 request_release_checks(req);
487
488 WARN_ON(atomic_read(&request_msg->kref.refcount) != 1);
489 WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1);
490 target_destroy(&req->r_t);
491
492 request_init(req);
493 req->r_osdc = osdc;
494 req->r_mempool = mempool;
495 req->r_num_ops = num_ops;
496 req->r_snapid = snapid;
497 req->r_snapc = snapc;
498 req->r_linger = linger;
499 req->r_request = request_msg;
500 req->r_reply = reply_msg;
501}
502
363struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 503struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
364 struct ceph_snap_context *snapc, 504 struct ceph_snap_context *snapc,
365 unsigned int num_ops, 505 unsigned int num_ops,
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
367 gfp_t gfp_flags) 507 gfp_t gfp_flags)
368{ 508{
369 struct ceph_osd_request *req; 509 struct ceph_osd_request *req;
370 struct ceph_msg *msg;
371 size_t msg_size;
372 510
373 if (use_mempool) { 511 if (use_mempool) {
374 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); 512 BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
383 if (unlikely(!req)) 521 if (unlikely(!req))
384 return NULL; 522 return NULL;
385 523
386 /* req only, each op is zeroed in _osd_req_op_init() */ 524 request_init(req);
387 memset(req, 0, sizeof(*req));
388
389 req->r_osdc = osdc; 525 req->r_osdc = osdc;
390 req->r_mempool = use_mempool; 526 req->r_mempool = use_mempool;
391 req->r_num_ops = num_ops; 527 req->r_num_ops = num_ops;
528 req->r_snapid = CEPH_NOSNAP;
529 req->r_snapc = ceph_get_snap_context(snapc);
392 530
393 kref_init(&req->r_kref); 531 dout("%s req %p\n", __func__, req);
394 init_completion(&req->r_completion); 532 return req;
395 init_completion(&req->r_safe_completion); 533}
396 RB_CLEAR_NODE(&req->r_node); 534EXPORT_SYMBOL(ceph_osdc_alloc_request);
397 INIT_LIST_HEAD(&req->r_unsafe_item);
398 INIT_LIST_HEAD(&req->r_linger_item);
399 INIT_LIST_HEAD(&req->r_linger_osd_item);
400 INIT_LIST_HEAD(&req->r_req_lru_item);
401 INIT_LIST_HEAD(&req->r_osd_item);
402
403 req->r_base_oloc.pool = -1;
404 req->r_target_oloc.pool = -1;
405 535
406 msg_size = OSD_OPREPLY_FRONT_LEN; 536int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
407 if (num_ops > CEPH_OSD_SLAB_OPS) { 537{
408 /* ceph_osd_op and rval */ 538 struct ceph_osd_client *osdc = req->r_osdc;
409 msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * 539 struct ceph_msg *msg;
410 (sizeof(struct ceph_osd_op) + 4); 540 int msg_size;
411 }
412 541
413 /* create reply message */ 542 WARN_ON(ceph_oid_empty(&req->r_base_oid));
414 if (use_mempool)
415 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
416 else
417 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
418 gfp_flags, true);
419 if (!msg) {
420 ceph_osdc_put_request(req);
421 return NULL;
422 }
423 req->r_reply = msg;
424 543
544 /* create request message */
425 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 545 msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
426 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 546 msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
427 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 547 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
428 msg_size += 1 + 8 + 4 + 4; /* pgid */ 548 msg_size += 1 + 8 + 4 + 4; /* pgid */
429 msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 549 msg_size += 4 + req->r_base_oid.name_len; /* oid */
430 msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); 550 msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
431 msg_size += 8; /* snapid */ 551 msg_size += 8; /* snapid */
432 msg_size += 8; /* snap_seq */ 552 msg_size += 8; /* snap_seq */
433 msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 553 msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
434 msg_size += 4; /* retry_attempt */ 554 msg_size += 4; /* retry_attempt */
435 555
436 /* create request message; allow space for oid */ 556 if (req->r_mempool)
437 if (use_mempool)
438 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 557 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
439 else 558 else
440 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 559 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
441 if (!msg) { 560 if (!msg)
442 ceph_osdc_put_request(req); 561 return -ENOMEM;
443 return NULL;
444 }
445 562
446 memset(msg->front.iov_base, 0, msg->front.iov_len); 563 memset(msg->front.iov_base, 0, msg->front.iov_len);
447
448 req->r_request = msg; 564 req->r_request = msg;
449 565
450 return req; 566 /* create reply message */
567 msg_size = OSD_OPREPLY_FRONT_LEN;
568 msg_size += req->r_base_oid.name_len;
569 msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
570
571 if (req->r_mempool)
572 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
573 else
574 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
575 if (!msg)
576 return -ENOMEM;
577
578 req->r_reply = msg;
579
580 return 0;
451} 581}
452EXPORT_SYMBOL(ceph_osdc_alloc_request); 582EXPORT_SYMBOL(ceph_osdc_alloc_messages);
453 583
454static bool osd_req_opcode_valid(u16 opcode) 584static bool osd_req_opcode_valid(u16 opcode)
455{ 585{
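
Request setup is now two-phase: allocate the request, fill in r_base_oid/r_base_oloc, then let ceph_osdc_alloc_messages() size the front buffers from the actual name length. A sketch of the new calling convention (pool id and object name hypothetical):

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
	if (!req)
		return -ENOMEM;

	req->r_base_oloc.pool = pool_id;
	ceph_oid_printf(&req->r_base_oid, "%s", "rbd_header.myimage");

	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
	if (ret) {
		ceph_osdc_put_request(req);
		return ret;
	}
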
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
587 717
588 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 718 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
589 719
590 op->cls.argc = 0; /* currently unused */
591
592 op->indata_len = payload_len; 720 op->indata_len = payload_len;
593} 721}
594EXPORT_SYMBOL(osd_req_op_cls_init); 722EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
627} 755}
628EXPORT_SYMBOL(osd_req_op_xattr_init); 756EXPORT_SYMBOL(osd_req_op_xattr_init);
629 757
630void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 758/*
631 unsigned int which, u16 opcode, 759 * @watch_opcode: CEPH_OSD_WATCH_OP_*
632 u64 cookie, u64 version, int flag) 760 */
761static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
762 u64 cookie, u8 watch_opcode)
633{ 763{
634 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, 764 struct ceph_osd_req_op *op;
635 opcode, 0);
636
637 BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
638 765
766 op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
639 op->watch.cookie = cookie; 767 op->watch.cookie = cookie;
640 op->watch.ver = version; 768 op->watch.op = watch_opcode;
641 if (opcode == CEPH_OSD_OP_WATCH && flag) 769 op->watch.gen = 0;
642 op->watch.flag = (u8)1;
643} 770}
644EXPORT_SYMBOL(osd_req_op_watch_init);
645 771
646void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 772void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
647 unsigned int which, 773 unsigned int which,
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
686 } 812 }
687} 813}
688 814
689static u64 osd_req_encode_op(struct ceph_osd_request *req, 815static u32 osd_req_encode_op(struct ceph_osd_op *dst,
690 struct ceph_osd_op *dst, unsigned int which) 816 const struct ceph_osd_req_op *src)
691{ 817{
692 struct ceph_osd_req_op *src;
693 struct ceph_osd_data *osd_data;
694 u64 request_data_len = 0;
695 u64 data_length;
696
697 BUG_ON(which >= req->r_num_ops);
698 src = &req->r_ops[which];
699 if (WARN_ON(!osd_req_opcode_valid(src->op))) { 818 if (WARN_ON(!osd_req_opcode_valid(src->op))) {
700 pr_err("unrecognized osd opcode %d\n", src->op); 819 pr_err("unrecognized osd opcode %d\n", src->op);
701 820
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
704 823
705 switch (src->op) { 824 switch (src->op) {
706 case CEPH_OSD_OP_STAT: 825 case CEPH_OSD_OP_STAT:
707 osd_data = &src->raw_data_in;
708 ceph_osdc_msg_data_add(req->r_reply, osd_data);
709 break; 826 break;
710 case CEPH_OSD_OP_READ: 827 case CEPH_OSD_OP_READ:
711 case CEPH_OSD_OP_WRITE: 828 case CEPH_OSD_OP_WRITE:
712 case CEPH_OSD_OP_WRITEFULL: 829 case CEPH_OSD_OP_WRITEFULL:
713 case CEPH_OSD_OP_ZERO: 830 case CEPH_OSD_OP_ZERO:
714 case CEPH_OSD_OP_TRUNCATE: 831 case CEPH_OSD_OP_TRUNCATE:
715 if (src->op == CEPH_OSD_OP_WRITE ||
716 src->op == CEPH_OSD_OP_WRITEFULL)
717 request_data_len = src->extent.length;
718 dst->extent.offset = cpu_to_le64(src->extent.offset); 832 dst->extent.offset = cpu_to_le64(src->extent.offset);
719 dst->extent.length = cpu_to_le64(src->extent.length); 833 dst->extent.length = cpu_to_le64(src->extent.length);
720 dst->extent.truncate_size = 834 dst->extent.truncate_size =
721 cpu_to_le64(src->extent.truncate_size); 835 cpu_to_le64(src->extent.truncate_size);
722 dst->extent.truncate_seq = 836 dst->extent.truncate_seq =
723 cpu_to_le32(src->extent.truncate_seq); 837 cpu_to_le32(src->extent.truncate_seq);
724 osd_data = &src->extent.osd_data;
725 if (src->op == CEPH_OSD_OP_WRITE ||
726 src->op == CEPH_OSD_OP_WRITEFULL)
727 ceph_osdc_msg_data_add(req->r_request, osd_data);
728 else
729 ceph_osdc_msg_data_add(req->r_reply, osd_data);
730 break; 838 break;
731 case CEPH_OSD_OP_CALL: 839 case CEPH_OSD_OP_CALL:
732 dst->cls.class_len = src->cls.class_len; 840 dst->cls.class_len = src->cls.class_len;
733 dst->cls.method_len = src->cls.method_len; 841 dst->cls.method_len = src->cls.method_len;
734 osd_data = &src->cls.request_info; 842 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
735 ceph_osdc_msg_data_add(req->r_request, osd_data);
736 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
737 request_data_len = osd_data->pagelist->length;
738
739 osd_data = &src->cls.request_data;
740 data_length = ceph_osd_data_length(osd_data);
741 if (data_length) {
742 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
743 dst->cls.indata_len = cpu_to_le32(data_length);
744 ceph_osdc_msg_data_add(req->r_request, osd_data);
745 src->indata_len += data_length;
746 request_data_len += data_length;
747 }
748 osd_data = &src->cls.response_data;
749 ceph_osdc_msg_data_add(req->r_reply, osd_data);
750 break; 843 break;
751 case CEPH_OSD_OP_STARTSYNC: 844 case CEPH_OSD_OP_STARTSYNC:
752 break; 845 break;
753 case CEPH_OSD_OP_NOTIFY_ACK:
754 case CEPH_OSD_OP_WATCH: 846 case CEPH_OSD_OP_WATCH:
755 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 847 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
756 dst->watch.ver = cpu_to_le64(src->watch.ver); 848 dst->watch.ver = cpu_to_le64(0);
757 dst->watch.flag = src->watch.flag; 849 dst->watch.op = src->watch.op;
850 dst->watch.gen = cpu_to_le32(src->watch.gen);
851 break;
852 case CEPH_OSD_OP_NOTIFY_ACK:
853 break;
854 case CEPH_OSD_OP_NOTIFY:
855 dst->notify.cookie = cpu_to_le64(src->notify.cookie);
758 break; 856 break;
759 case CEPH_OSD_OP_SETALLOCHINT: 857 case CEPH_OSD_OP_SETALLOCHINT:
760 dst->alloc_hint.expected_object_size = 858 dst->alloc_hint.expected_object_size =
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
768 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 866 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
769 dst->xattr.cmp_op = src->xattr.cmp_op; 867 dst->xattr.cmp_op = src->xattr.cmp_op;
770 dst->xattr.cmp_mode = src->xattr.cmp_mode; 868 dst->xattr.cmp_mode = src->xattr.cmp_mode;
771 osd_data = &src->xattr.osd_data;
772 ceph_osdc_msg_data_add(req->r_request, osd_data);
773 request_data_len = osd_data->pagelist->length;
774 break; 869 break;
775 case CEPH_OSD_OP_CREATE: 870 case CEPH_OSD_OP_CREATE:
776 case CEPH_OSD_OP_DELETE: 871 case CEPH_OSD_OP_DELETE:
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
787 dst->flags = cpu_to_le32(src->flags); 882 dst->flags = cpu_to_le32(src->flags);
788 dst->payload_len = cpu_to_le32(src->indata_len); 883 dst->payload_len = cpu_to_le32(src->indata_len);
789 884
790 return request_data_len; 885 return src->indata_len;
791} 886}
792 887
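With the message-data plumbing gone, osd_req_encode_op() is a pure src -> wire transform. A caller encodes all ops in a simple loop and sums the request data lengths, exactly as encode_request() does below:

	for (i = 0; i < req->r_num_ops; i++) {
		data_len += osd_req_encode_op(p, &req->r_ops[i]);
		p += sizeof(struct ceph_osd_op);
	}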
793/* 888/*
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
824 919
825 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, 920 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
826 GFP_NOFS); 921 GFP_NOFS);
827 if (!req) 922 if (!req) {
828 return ERR_PTR(-ENOMEM); 923 r = -ENOMEM;
829 924 goto fail;
830 req->r_flags = flags; 925 }
831 926
832 /* calculate max write size */ 927 /* calculate max write size */
833 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); 928 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
834 if (r < 0) { 929 if (r)
835 ceph_osdc_put_request(req); 930 goto fail;
836 return ERR_PTR(r);
837 }
838 931
839 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { 932 if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
840 osd_req_op_init(req, which, opcode, 0); 933 osd_req_op_init(req, which, opcode, 0);
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
854 truncate_size, truncate_seq); 947 truncate_size, truncate_seq);
855 } 948 }
856 949
950 req->r_flags = flags;
857 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 951 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
952 ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
858 953
859 snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), 954 req->r_snapid = vino.snap;
860 "%llx.%08llx", vino.ino, objnum); 955 if (flags & CEPH_OSD_FLAG_WRITE)
861 req->r_base_oid.name_len = strlen(req->r_base_oid.name); 956 req->r_data_offset = off;
957
958 r = ceph_osdc_alloc_messages(req, GFP_NOFS);
959 if (r)
960 goto fail;
862 961
863 return req; 962 return req;
963
964fail:
965 ceph_osdc_put_request(req);
966 return ERR_PTR(r);
864} 967}
865EXPORT_SYMBOL(ceph_osdc_new_request); 968EXPORT_SYMBOL(ceph_osdc_new_request);
866 969
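Callers still consume the result as an ERR_PTR. A hedged sketch of a typical single-op read setup (argument values are illustrative only):

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, truncate_seq, truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);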
867/* 970/*
868 * We keep osd requests in an rbtree, sorted by ->r_tid. 971 * We keep osd requests in an rbtree, sorted by ->r_tid.
869 */ 972 */
870static void __insert_request(struct ceph_osd_client *osdc, 973DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node)
871 struct ceph_osd_request *new) 974DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node)
872{
873 struct rb_node **p = &osdc->requests.rb_node;
874 struct rb_node *parent = NULL;
875 struct ceph_osd_request *req = NULL;
876
877 while (*p) {
878 parent = *p;
879 req = rb_entry(parent, struct ceph_osd_request, r_node);
880 if (new->r_tid < req->r_tid)
881 p = &(*p)->rb_left;
882 else if (new->r_tid > req->r_tid)
883 p = &(*p)->rb_right;
884 else
885 BUG();
886 }
887
888 rb_link_node(&new->r_node, parent, p);
889 rb_insert_color(&new->r_node, &osdc->requests);
890}
891
892static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
893 u64 tid)
894{
895 struct ceph_osd_request *req;
896 struct rb_node *n = osdc->requests.rb_node;
897
898 while (n) {
899 req = rb_entry(n, struct ceph_osd_request, r_node);
900 if (tid < req->r_tid)
901 n = n->rb_left;
902 else if (tid > req->r_tid)
903 n = n->rb_right;
904 else
905 return req;
906 }
907 return NULL;
908}
909 975
910static struct ceph_osd_request * 976static bool osd_homeless(struct ceph_osd *osd)
911__lookup_request_ge(struct ceph_osd_client *osdc,
912 u64 tid)
913{ 977{
914 struct ceph_osd_request *req; 978 return osd->o_osd == CEPH_HOMELESS_OSD;
915 struct rb_node *n = osdc->requests.rb_node;
916
917 while (n) {
918 req = rb_entry(n, struct ceph_osd_request, r_node);
919 if (tid < req->r_tid) {
920 if (!n->rb_left)
921 return req;
922 n = n->rb_left;
923 } else if (tid > req->r_tid) {
924 n = n->rb_right;
925 } else {
926 return req;
927 }
928 }
929 return NULL;
930} 979}
931 980
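DEFINE_RB_FUNCS() is assumed to expand to typed insert/lookup/erase helpers keyed on the named field, replacing the open-coded rbtree walkers deleted on the left; roughly:

	static void insert_request(struct rb_root *root,
				   struct ceph_osd_request *req);
	static struct ceph_osd_request *lookup_request(struct rb_root *root,
						       u64 tid);
	static void erase_request(struct rb_root *root,
				  struct ceph_osd_request *req);

plus a parallel *_request_mc set over r_mc_node for the map-check tree.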
932static void __kick_linger_request(struct ceph_osd_request *req) 981static bool osd_registered(struct ceph_osd *osd)
933{ 982{
934 struct ceph_osd_client *osdc = req->r_osdc; 983 verify_osdc_locked(osd->o_osdc);
935 struct ceph_osd *osd = req->r_osd;
936
937 /*
938 * Linger requests need to be resent with a new tid to avoid
939 * the dup op detection logic on the OSDs. Achieve this with
940 * a re-register dance instead of open-coding.
941 */
942 ceph_osdc_get_request(req);
943 if (!list_empty(&req->r_linger_item))
944 __unregister_linger_request(osdc, req);
945 else
946 __unregister_request(osdc, req);
947 __register_request(osdc, req);
948 ceph_osdc_put_request(req);
949
950 /*
951 * Unless request has been registered as both normal and
952 * lingering, __unregister{,_linger}_request clears r_osd.
953 * However, here we need to preserve r_osd to make sure we
954 * requeue on the same OSD.
955 */
956 WARN_ON(req->r_osd || !osd);
957 req->r_osd = osd;
958 984
959 dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); 985 return !RB_EMPTY_NODE(&osd->o_node);
960 __enqueue_request(req);
961} 986}
962 987
963/* 988/*
964 * Resubmit requests pending on the given osd. 989 * Assumes @osd is zero-initialized.
965 */ 990 */
966static void __kick_osd_requests(struct ceph_osd_client *osdc, 991static void osd_init(struct ceph_osd *osd)
967 struct ceph_osd *osd)
968{ 992{
969 struct ceph_osd_request *req, *nreq; 993 atomic_set(&osd->o_ref, 1);
970 LIST_HEAD(resend); 994 RB_CLEAR_NODE(&osd->o_node);
971 LIST_HEAD(resend_linger); 995 osd->o_requests = RB_ROOT;
972 int err; 996 osd->o_linger_requests = RB_ROOT;
973 997 INIT_LIST_HEAD(&osd->o_osd_lru);
974 dout("%s osd%d\n", __func__, osd->o_osd); 998 INIT_LIST_HEAD(&osd->o_keepalive_item);
975 err = __reset_osd(osdc, osd); 999 osd->o_incarnation = 1;
976 if (err) 1000 mutex_init(&osd->lock);
977 return;
978
979 /*
980 * Build up a list of requests to resend by traversing the
981 * osd's list of requests. Requests for a given object are
982 * sent in tid order, and that is also the order they're
983 * kept on this list. Therefore all requests that are in
984 * flight will be found first, followed by all requests that
985 * have not yet been sent. And to resend requests while
986 * preserving this order we will want to put any sent
987 * requests back on the front of the osd client's unsent
988 * list.
989 *
990 * So we build a separate ordered list of already-sent
991 * requests for the affected osd and splice it onto the
992 * front of the osd client's unsent list. Once we've seen a
993 * request that has not yet been sent we're done. Those
994 * requests are already sitting right where they belong.
995 */
996 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
997 if (!req->r_sent)
998 break;
999
1000 if (!req->r_linger) {
1001 dout("%s requeueing %p tid %llu\n", __func__, req,
1002 req->r_tid);
1003 list_move_tail(&req->r_req_lru_item, &resend);
1004 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1005 } else {
1006 list_move_tail(&req->r_req_lru_item, &resend_linger);
1007 }
1008 }
1009 list_splice(&resend, &osdc->req_unsent);
1010
1011 /*
1012 * Both registered and not yet registered linger requests are
1013 * enqueued with a new tid on the same OSD. We add/move them
1014 * to req_unsent/o_requests at the end to keep things in tid
1015 * order.
1016 */
1017 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
1018 r_linger_osd_item) {
1019 WARN_ON(!list_empty(&req->r_req_lru_item));
1020 __kick_linger_request(req);
1021 }
1022
1023 list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item)
1024 __kick_linger_request(req);
1025} 1001}
1026 1002
1027/* 1003static void osd_cleanup(struct ceph_osd *osd)
1028 * If the osd connection drops, we need to resubmit all requests.
1029 */
1030static void osd_reset(struct ceph_connection *con)
1031{ 1004{
1032 struct ceph_osd *osd = con->private; 1005 WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
1033 struct ceph_osd_client *osdc; 1006 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
1034 1007 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
1035 if (!osd) 1008 WARN_ON(!list_empty(&osd->o_osd_lru));
1036 return; 1009 WARN_ON(!list_empty(&osd->o_keepalive_item));
1037 dout("osd_reset osd%d\n", osd->o_osd); 1010
1038 osdc = osd->o_osdc; 1011 if (osd->o_auth.authorizer) {
1039 down_read(&osdc->map_sem); 1012 WARN_ON(osd_homeless(osd));
1040 mutex_lock(&osdc->request_mutex); 1013 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1041 __kick_osd_requests(osdc, osd); 1014 }
1042 __send_queued(osdc);
1043 mutex_unlock(&osdc->request_mutex);
1044 up_read(&osdc->map_sem);
1045} 1015}
1046 1016
1047/* 1017/*
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
1051{ 1021{
1052 struct ceph_osd *osd; 1022 struct ceph_osd *osd;
1053 1023
1054 osd = kzalloc(sizeof(*osd), GFP_NOFS); 1024 WARN_ON(onum == CEPH_HOMELESS_OSD);
1055 if (!osd)
1056 return NULL;
1057 1025
1058 atomic_set(&osd->o_ref, 1); 1026 osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL);
1027 osd_init(osd);
1059 osd->o_osdc = osdc; 1028 osd->o_osdc = osdc;
1060 osd->o_osd = onum; 1029 osd->o_osd = onum;
1061 RB_CLEAR_NODE(&osd->o_node);
1062 INIT_LIST_HEAD(&osd->o_requests);
1063 INIT_LIST_HEAD(&osd->o_linger_requests);
1064 INIT_LIST_HEAD(&osd->o_osd_lru);
1065 osd->o_incarnation = 1;
1066 1030
1067 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); 1031 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
1068 1032
1069 INIT_LIST_HEAD(&osd->o_keepalive_item);
1070 return osd; 1033 return osd;
1071} 1034}
1072 1035
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd)
1087 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 1050 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
1088 atomic_read(&osd->o_ref) - 1); 1051 atomic_read(&osd->o_ref) - 1);
1089 if (atomic_dec_and_test(&osd->o_ref)) { 1052 if (atomic_dec_and_test(&osd->o_ref)) {
1090 if (osd->o_auth.authorizer) 1053 osd_cleanup(osd);
1091 ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
1092 kfree(osd); 1054 kfree(osd);
1093 } 1055 }
1094} 1056}
1095 1057
1096/* 1058DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node)
1097 * remove an osd from our map
1098 */
1099static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1100{
1101 dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
1102 WARN_ON(!list_empty(&osd->o_requests));
1103 WARN_ON(!list_empty(&osd->o_linger_requests));
1104 1059
1105 list_del_init(&osd->o_osd_lru); 1060static void __move_osd_to_lru(struct ceph_osd *osd)
1106 rb_erase(&osd->o_node, &osdc->osds);
1107 RB_CLEAR_NODE(&osd->o_node);
1108}
1109
1110static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1111{ 1061{
1112 dout("%s %p osd%d\n", __func__, osd, osd->o_osd); 1062 struct ceph_osd_client *osdc = osd->o_osdc;
1113
1114 if (!RB_EMPTY_NODE(&osd->o_node)) {
1115 ceph_con_close(&osd->o_con);
1116 __remove_osd(osdc, osd);
1117 put_osd(osd);
1118 }
1119}
1120
1121static void remove_all_osds(struct ceph_osd_client *osdc)
1122{
1123 dout("%s %p\n", __func__, osdc);
1124 mutex_lock(&osdc->request_mutex);
1125 while (!RB_EMPTY_ROOT(&osdc->osds)) {
1126 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
1127 struct ceph_osd, o_node);
1128 remove_osd(osdc, osd);
1129 }
1130 mutex_unlock(&osdc->request_mutex);
1131}
1132 1063
1133static void __move_osd_to_lru(struct ceph_osd_client *osdc, 1064 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1134 struct ceph_osd *osd)
1135{
1136 dout("%s %p\n", __func__, osd);
1137 BUG_ON(!list_empty(&osd->o_osd_lru)); 1065 BUG_ON(!list_empty(&osd->o_osd_lru));
1138 1066
1067 spin_lock(&osdc->osd_lru_lock);
1139 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 1068 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
1069 spin_unlock(&osdc->osd_lru_lock);
1070
1140 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; 1071 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
1141} 1072}
1142 1073
1143static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, 1074static void maybe_move_osd_to_lru(struct ceph_osd *osd)
1144 struct ceph_osd *osd)
1145{ 1075{
1146 dout("%s %p\n", __func__, osd); 1076 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1147 1077 RB_EMPTY_ROOT(&osd->o_linger_requests))
1148 if (list_empty(&osd->o_requests) && 1078 __move_osd_to_lru(osd);
1149 list_empty(&osd->o_linger_requests))
1150 __move_osd_to_lru(osdc, osd);
1151} 1079}
1152 1080
1153static void __remove_osd_from_lru(struct ceph_osd *osd) 1081static void __remove_osd_from_lru(struct ceph_osd *osd)
1154{ 1082{
1155 dout("__remove_osd_from_lru %p\n", osd); 1083 struct ceph_osd_client *osdc = osd->o_osdc;
1084
1085 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1086
1087 spin_lock(&osdc->osd_lru_lock);
1156 if (!list_empty(&osd->o_osd_lru)) 1088 if (!list_empty(&osd->o_osd_lru))
1157 list_del_init(&osd->o_osd_lru); 1089 list_del_init(&osd->o_osd_lru);
1090 spin_unlock(&osdc->osd_lru_lock);
1158} 1091}
1159 1092
1160static void remove_old_osds(struct ceph_osd_client *osdc) 1093/*
1094 * Close the connection and assign any leftover requests to the
1095 * homeless session.
1096 */
1097static void close_osd(struct ceph_osd *osd)
1161{ 1098{
1162 struct ceph_osd *osd, *nosd; 1099 struct ceph_osd_client *osdc = osd->o_osdc;
1100 struct rb_node *n;
1163 1101
1164 dout("__remove_old_osds %p\n", osdc); 1102 verify_osdc_wrlocked(osdc);
1165 mutex_lock(&osdc->request_mutex); 1103 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1166 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 1104
1167 if (time_before(jiffies, osd->lru_ttl)) 1105 ceph_con_close(&osd->o_con);
1168 break; 1106
1169 remove_osd(osdc, osd); 1107 for (n = rb_first(&osd->o_requests); n; ) {
1108 struct ceph_osd_request *req =
1109 rb_entry(n, struct ceph_osd_request, r_node);
1110
1111 n = rb_next(n); /* unlink_request() */
1112
1113 dout(" reassigning req %p tid %llu\n", req, req->r_tid);
1114 unlink_request(osd, req);
1115 link_request(&osdc->homeless_osd, req);
1116 }
1117 for (n = rb_first(&osd->o_linger_requests); n; ) {
1118 struct ceph_osd_linger_request *lreq =
1119 rb_entry(n, struct ceph_osd_linger_request, node);
1120
1121 n = rb_next(n); /* unlink_linger() */
1122
1123 dout(" reassigning lreq %p linger_id %llu\n", lreq,
1124 lreq->linger_id);
1125 unlink_linger(osd, lreq);
1126 link_linger(&osdc->homeless_osd, lreq);
1170 } 1127 }
1171 mutex_unlock(&osdc->request_mutex); 1128
1129 __remove_osd_from_lru(osd);
1130 erase_osd(&osdc->osds, osd);
1131 put_osd(osd);
1172} 1132}
1173 1133
1174/* 1134/*
1175 * reset the osd connection 1135 * reset the osd connection
1176 */ 1136 */
1177static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 1137static int reopen_osd(struct ceph_osd *osd)
1178{ 1138{
1179 struct ceph_entity_addr *peer_addr; 1139 struct ceph_entity_addr *peer_addr;
1180 1140
1181 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 1141 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
1182 if (list_empty(&osd->o_requests) && 1142
1183 list_empty(&osd->o_linger_requests)) { 1143 if (RB_EMPTY_ROOT(&osd->o_requests) &&
1184 remove_osd(osdc, osd); 1144 RB_EMPTY_ROOT(&osd->o_linger_requests)) {
1145 close_osd(osd);
1185 return -ENODEV; 1146 return -ENODEV;
1186 } 1147 }
1187 1148
1188 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; 1149 peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd];
1189 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && 1150 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
1190 !ceph_con_opened(&osd->o_con)) { 1151 !ceph_con_opened(&osd->o_con)) {
1191 struct ceph_osd_request *req; 1152 struct rb_node *n;
1192 1153
1193 dout("osd addr hasn't changed and connection never opened, " 1154 dout("osd addr hasn't changed and connection never opened, "
1194 "letting msgr retry\n"); 1155 "letting msgr retry\n");
1195 /* touch each r_stamp for handle_timeout()'s benefit */ 1156 /* touch each r_stamp for handle_timeout()'s benefit */
1196 list_for_each_entry(req, &osd->o_requests, r_osd_item) 1157 for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
1158 struct ceph_osd_request *req =
1159 rb_entry(n, struct ceph_osd_request, r_node);
1197 req->r_stamp = jiffies; 1160 req->r_stamp = jiffies;
1161 }
1198 1162
1199 return -EAGAIN; 1163 return -EAGAIN;
1200 } 1164 }
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
1206 return 0; 1170 return 0;
1207} 1171}
1208 1172
1209static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 1173static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o,
1174 bool wrlocked)
1210{ 1175{
1211 struct rb_node **p = &osdc->osds.rb_node; 1176 struct ceph_osd *osd;
1212 struct rb_node *parent = NULL;
1213 struct ceph_osd *osd = NULL;
1214 1177
1215 dout("__insert_osd %p osd%d\n", new, new->o_osd); 1178 if (wrlocked)
1216 while (*p) { 1179 verify_osdc_wrlocked(osdc);
1217 parent = *p; 1180 else
1218 osd = rb_entry(parent, struct ceph_osd, o_node); 1181 verify_osdc_locked(osdc);
1219 if (new->o_osd < osd->o_osd) 1182
1220 p = &(*p)->rb_left; 1183 if (o != CEPH_HOMELESS_OSD)
1221 else if (new->o_osd > osd->o_osd) 1184 osd = lookup_osd(&osdc->osds, o);
1222 p = &(*p)->rb_right; 1185 else
1223 else 1186 osd = &osdc->homeless_osd;
1224 BUG(); 1187 if (!osd) {
1188 if (!wrlocked)
1189 return ERR_PTR(-EAGAIN);
1190
1191 osd = create_osd(osdc, o);
1192 insert_osd(&osdc->osds, osd);
1193 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
1194 &osdc->osdmap->osd_addr[osd->o_osd]);
1225 } 1195 }
1226 1196
1227 rb_link_node(&new->o_node, parent, p); 1197 dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd);
1228 rb_insert_color(&new->o_node, &osdc->osds); 1198 return osd;
1229} 1199}
1230 1200
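lookup_create_osd() is the single entry point for resolving a target OSD id to a session; CEPH_HOMELESS_OSD maps to the embedded osdc->homeless_osd instead of a real connection. Its contract is visible in __submit_request() below:

	osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
	if (IS_ERR(osd)) {
		/* -EAGAIN: creation needs the wrlock; retry under it */
		WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
		goto promote;
	}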
1231static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) 1201/*
1202 * Create request <-> OSD session relation.
1203 *
1204 * @req has to be assigned a tid, @osd may be homeless.
1205 */
1206static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1232{ 1207{
1233 struct ceph_osd *osd; 1208 verify_osd_locked(osd);
1234 struct rb_node *n = osdc->osds.rb_node; 1209 WARN_ON(!req->r_tid || req->r_osd);
1235 1210 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1236 while (n) { 1211 req, req->r_tid);
1237 osd = rb_entry(n, struct ceph_osd, o_node); 1212
1238 if (o < osd->o_osd) 1213 if (!osd_homeless(osd))
1239 n = n->rb_left; 1214 __remove_osd_from_lru(osd);
1240 else if (o > osd->o_osd) 1215 else
1241 n = n->rb_right; 1216 atomic_inc(&osd->o_osdc->num_homeless);
1242 else 1217
1243 return osd; 1218 get_osd(osd);
1244 } 1219 insert_request(&osd->o_requests, req);
1245 return NULL; 1220 req->r_osd = osd;
1246} 1221}
1247 1222
1248static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 1223static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
1249{ 1224{
1250 schedule_delayed_work(&osdc->timeout_work, 1225 verify_osd_locked(osd);
1251 osdc->client->options->osd_keepalive_timeout); 1226 WARN_ON(req->r_osd != osd);
1227 dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd,
1228 req, req->r_tid);
1229
1230 req->r_osd = NULL;
1231 erase_request(&osd->o_requests, req);
1232 put_osd(osd);
1233
1234 if (!osd_homeless(osd))
1235 maybe_move_osd_to_lru(osd);
1236 else
1237 atomic_dec(&osd->o_osdc->num_homeless);
1252} 1238}
1253 1239
1254static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 1240static bool __pool_full(struct ceph_pg_pool_info *pi)
1255{ 1241{
1256 cancel_delayed_work(&osdc->timeout_work); 1242 return pi->flags & CEPH_POOL_FLAG_FULL;
1257} 1243}
1258 1244
1259/* 1245static bool have_pool_full(struct ceph_osd_client *osdc)
1260 * Register request, assign tid. If this is the first request, set up
1261 * the timeout event.
1262 */
1263static void __register_request(struct ceph_osd_client *osdc,
1264 struct ceph_osd_request *req)
1265{ 1246{
1266 req->r_tid = ++osdc->last_tid; 1247 struct rb_node *n;
1267 req->r_request->hdr.tid = cpu_to_le64(req->r_tid); 1248
1268 dout("__register_request %p tid %lld\n", req, req->r_tid); 1249 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1269 __insert_request(osdc, req); 1250 struct ceph_pg_pool_info *pi =
1270 ceph_osdc_get_request(req); 1251 rb_entry(n, struct ceph_pg_pool_info, node);
1271 osdc->num_requests++; 1252
1272 if (osdc->num_requests == 1) { 1253 if (__pool_full(pi))
1273 dout(" first request, scheduling timeout\n"); 1254 return true;
1274 __schedule_osd_timeout(osdc);
1275 } 1255 }
1256
1257 return false;
1258}
1259
1260static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id)
1261{
1262 struct ceph_pg_pool_info *pi;
1263
1264 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
1265 if (!pi)
1266 return false;
1267
1268 return __pool_full(pi);
1276} 1269}
1277 1270
1278/* 1271/*
1279 * called under osdc->request_mutex 1272 * Returns whether a request should be blocked from being sent
1273 * based on the current osdmap and osd_client settings.
1280 */ 1274 */
1281static void __unregister_request(struct ceph_osd_client *osdc, 1275static bool target_should_be_paused(struct ceph_osd_client *osdc,
1282 struct ceph_osd_request *req) 1276 const struct ceph_osd_request_target *t,
1277 struct ceph_pg_pool_info *pi)
1283{ 1278{
1284 if (RB_EMPTY_NODE(&req->r_node)) { 1279 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
1285 dout("__unregister_request %p tid %lld not registered\n", 1280 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
1286 req, req->r_tid); 1281 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1287 return; 1282 __pool_full(pi);
1283
1284 WARN_ON(pi->id != t->base_oloc.pool);
1285 return (t->flags & CEPH_OSD_FLAG_READ && pauserd) ||
1286 (t->flags & CEPH_OSD_FLAG_WRITE && pausewr);
1287}
1288
1289enum calc_target_result {
1290 CALC_TARGET_NO_ACTION = 0,
1291 CALC_TARGET_NEED_RESEND,
1292 CALC_TARGET_POOL_DNE,
1293};
1294
1295static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1296 struct ceph_osd_request_target *t,
1297 u32 *last_force_resend,
1298 bool any_change)
1299{
1300 struct ceph_pg_pool_info *pi;
1301 struct ceph_pg pgid, last_pgid;
1302 struct ceph_osds up, acting;
1303 bool force_resend = false;
1304 bool need_check_tiering = false;
1305 bool need_resend = false;
1306 bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap,
1307 CEPH_OSDMAP_SORTBITWISE);
1308 enum calc_target_result ct_res;
1309 int ret;
1310
1311 pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
1312 if (!pi) {
1313 t->osd = CEPH_HOMELESS_OSD;
1314 ct_res = CALC_TARGET_POOL_DNE;
1315 goto out;
1288 } 1316 }
1289 1317
1290 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 1318 if (osdc->osdmap->epoch == pi->last_force_request_resend) {
1291 rb_erase(&req->r_node, &osdc->requests); 1319 if (last_force_resend &&
1292 RB_CLEAR_NODE(&req->r_node); 1320 *last_force_resend < pi->last_force_request_resend) {
1293 osdc->num_requests--; 1321 *last_force_resend = pi->last_force_request_resend;
1322 force_resend = true;
1323 } else if (!last_force_resend) {
1324 force_resend = true;
1325 }
1326 }
1327 if (ceph_oid_empty(&t->target_oid) || force_resend) {
1328 ceph_oid_copy(&t->target_oid, &t->base_oid);
1329 need_check_tiering = true;
1330 }
1331 if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
1332 ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
1333 need_check_tiering = true;
1334 }
1294 1335
1295 if (req->r_osd) { 1336 if (need_check_tiering &&
1296 /* make sure the original request isn't in flight. */ 1337 (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
1297 ceph_msg_revoke(req->r_request); 1338 if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
1339 t->target_oloc.pool = pi->read_tier;
1340 if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
1341 t->target_oloc.pool = pi->write_tier;
1342 }
1298 1343
1299 list_del_init(&req->r_osd_item); 1344 ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid,
1300 maybe_move_osd_to_lru(osdc, req->r_osd); 1345 &t->target_oloc, &pgid);
1301 if (list_empty(&req->r_linger_osd_item)) 1346 if (ret) {
1302 req->r_osd = NULL; 1347 WARN_ON(ret != -ENOENT);
1348 t->osd = CEPH_HOMELESS_OSD;
1349 ct_res = CALC_TARGET_POOL_DNE;
1350 goto out;
1351 }
1352 last_pgid.pool = pgid.pool;
1353 last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
1354
1355 ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting);
1356 if (any_change &&
1357 ceph_is_new_interval(&t->acting,
1358 &acting,
1359 &t->up,
1360 &up,
1361 t->size,
1362 pi->size,
1363 t->min_size,
1364 pi->min_size,
1365 t->pg_num,
1366 pi->pg_num,
1367 t->sort_bitwise,
1368 sort_bitwise,
1369 &last_pgid))
1370 force_resend = true;
1371
1372 if (t->paused && !target_should_be_paused(osdc, t, pi)) {
1373 t->paused = false;
1374 need_resend = true;
1303 } 1375 }
1304 1376
1305 list_del_init(&req->r_req_lru_item); 1377 if (ceph_pg_compare(&t->pgid, &pgid) ||
1306 ceph_osdc_put_request(req); 1378 ceph_osds_changed(&t->acting, &acting, any_change) ||
1379 force_resend) {
1380 t->pgid = pgid; /* struct */
1381 ceph_osds_copy(&t->acting, &acting);
1382 ceph_osds_copy(&t->up, &up);
1383 t->size = pi->size;
1384 t->min_size = pi->min_size;
1385 t->pg_num = pi->pg_num;
1386 t->pg_num_mask = pi->pg_num_mask;
1387 t->sort_bitwise = sort_bitwise;
1388
1389 t->osd = acting.primary;
1390 need_resend = true;
1391 }
1392
1393 ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION;
1394out:
1395 dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
1396 return ct_res;
1397}
1398
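calc_target() folds pool tiering, forced-resend epochs, and up/acting interval changes into one verdict. A hedged sketch of how a caller dispatches on it (the NEED_RESEND arm is handled by the scan/resend paths, not shown here):

	switch (calc_target(osdc, &req->r_t, &req->r_last_force_resend,
			    false)) {
	case CALC_TARGET_NO_ACTION:
		break;
	case CALC_TARGET_NEED_RESEND:
		/* relink to the new session and resend */
		break;
	case CALC_TARGET_POOL_DNE:
		send_map_check(req);	/* confirm the pool is gone */
		break;
	}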
1399static void setup_request_data(struct ceph_osd_request *req,
1400 struct ceph_msg *msg)
1401{
1402 u32 data_len = 0;
1403 int i;
1404
1405 if (!list_empty(&msg->data))
1406 return;
1407
1408 WARN_ON(msg->data_length);
1409 for (i = 0; i < req->r_num_ops; i++) {
1410 struct ceph_osd_req_op *op = &req->r_ops[i];
1411
1412 switch (op->op) {
1413 /* request */
1414 case CEPH_OSD_OP_WRITE:
1415 case CEPH_OSD_OP_WRITEFULL:
1416 WARN_ON(op->indata_len != op->extent.length);
1417 ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
1418 break;
1419 case CEPH_OSD_OP_SETXATTR:
1420 case CEPH_OSD_OP_CMPXATTR:
1421 WARN_ON(op->indata_len != op->xattr.name_len +
1422 op->xattr.value_len);
1423 ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
1424 break;
1425 case CEPH_OSD_OP_NOTIFY_ACK:
1426 ceph_osdc_msg_data_add(msg,
1427 &op->notify_ack.request_data);
1428 break;
1429
1430 /* reply */
1431 case CEPH_OSD_OP_STAT:
1432 ceph_osdc_msg_data_add(req->r_reply,
1433 &op->raw_data_in);
1434 break;
1435 case CEPH_OSD_OP_READ:
1436 ceph_osdc_msg_data_add(req->r_reply,
1437 &op->extent.osd_data);
1438 break;
1439
1440 /* both */
1441 case CEPH_OSD_OP_CALL:
1442 WARN_ON(op->indata_len != op->cls.class_len +
1443 op->cls.method_len +
1444 op->cls.indata_len);
1445 ceph_osdc_msg_data_add(msg, &op->cls.request_info);
1446 /* optional, can be NONE */
1447 ceph_osdc_msg_data_add(msg, &op->cls.request_data);
1448 /* optional, can be NONE */
1449 ceph_osdc_msg_data_add(req->r_reply,
1450 &op->cls.response_data);
1451 break;
1452 case CEPH_OSD_OP_NOTIFY:
1453 ceph_osdc_msg_data_add(msg,
1454 &op->notify.request_data);
1455 ceph_osdc_msg_data_add(req->r_reply,
1456 &op->notify.response_data);
1457 break;
1458 }
1459
1460 data_len += op->indata_len;
1461 }
1462
1463 WARN_ON(data_len != msg->data_length);
1464}
1465
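setup_request_data() only wires up buffers the caller has already attached to each op. A hedged example of that attachment for a one-op read, using the existing page-vector helper (the page array itself is illustrative):

	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
					 false, false);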
1466static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
1467{
1468 void *p = msg->front.iov_base;
1469 void *const end = p + msg->front_alloc_len;
1470 u32 data_len = 0;
1471 int i;
1472
1473 if (req->r_flags & CEPH_OSD_FLAG_WRITE) {
1474 /* snapshots aren't writeable */
1475 WARN_ON(req->r_snapid != CEPH_NOSNAP);
1476 } else {
1477 WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec ||
1478 req->r_data_offset || req->r_snapc);
1479 }
1480
1481 setup_request_data(req, msg);
1482
1483 ceph_encode_32(&p, 1); /* client_inc, always 1 */
1484 ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
1485 ceph_encode_32(&p, req->r_flags);
1486 ceph_encode_timespec(p, &req->r_mtime);
1487 p += sizeof(struct ceph_timespec);
1488 /* aka reassert_version */
1489 memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version));
1490 p += sizeof(req->r_replay_version);
1491
1492 /* oloc */
1493 ceph_encode_8(&p, 4);
1494 ceph_encode_8(&p, 4);
1495 ceph_encode_32(&p, 8 + 4 + 4);
1496 ceph_encode_64(&p, req->r_t.target_oloc.pool);
1497 ceph_encode_32(&p, -1); /* preferred */
1498 ceph_encode_32(&p, 0); /* key len */
1499
1500 /* pgid */
1501 ceph_encode_8(&p, 1);
1502 ceph_encode_64(&p, req->r_t.pgid.pool);
1503 ceph_encode_32(&p, req->r_t.pgid.seed);
1504 ceph_encode_32(&p, -1); /* preferred */
1505
1506 /* oid */
1507 ceph_encode_32(&p, req->r_t.target_oid.name_len);
1508 memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
1509 p += req->r_t.target_oid.name_len;
1307 1510
1308 if (osdc->num_requests == 0) { 1511 /* ops, can imply data */
1309 dout(" no requests, canceling timeout\n"); 1512 ceph_encode_16(&p, req->r_num_ops);
1310 __cancel_osd_timeout(osdc); 1513 for (i = 0; i < req->r_num_ops; i++) {
1514 data_len += osd_req_encode_op(p, &req->r_ops[i]);
1515 p += sizeof(struct ceph_osd_op);
1311 } 1516 }
1517
1518 ceph_encode_64(&p, req->r_snapid); /* snapid */
1519 if (req->r_snapc) {
1520 ceph_encode_64(&p, req->r_snapc->seq);
1521 ceph_encode_32(&p, req->r_snapc->num_snaps);
1522 for (i = 0; i < req->r_snapc->num_snaps; i++)
1523 ceph_encode_64(&p, req->r_snapc->snaps[i]);
1524 } else {
1525 ceph_encode_64(&p, 0); /* snap_seq */
1526 ceph_encode_32(&p, 0); /* snaps len */
1527 }
1528
1529 ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
1530
1531 BUG_ON(p > end);
1532 msg->front.iov_len = p - msg->front.iov_base;
1533 msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
1534 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1535 msg->hdr.data_len = cpu_to_le32(data_len);
1536 /*
1537 * The header "data_off" is a hint to the receiver allowing it
1538 * to align received data into its buffers such that there's no
1539 * need to re-copy it before writing it to disk (direct I/O).
1540 */
1541 msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
1542
1543 dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__,
1544 req, req->r_t.target_oid.name_len, req->r_t.target_oid.name,
1545 req->r_t.target_oid.name_len, msg->front.iov_len, data_len);
1312} 1546}
1313 1547
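For reference, a condensed sketch of the MOSDOp v4 front laid out above (field order per the ceph_encode_* calls):

	u32 client_inc (always 1)
	u32 osdmap epoch
	u32 flags
	ceph_timespec mtime
	eversion reassert_version
	object_locator oloc (v4: pool, preferred -1, empty key)
	pg_t pgid (v1: pool, seed, preferred -1)
	string oid
	u16 num_ops, then ops[] (data lengths summed into hdr.data_len)
	u64 snapid
	u64 snap_seq, u32 num_snaps, snaps[]
	u32 retry_attempt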
1314/* 1548/*
1315 * Cancel a previously queued request message 1549 * @req has to be assigned a tid and registered.
1316 */ 1550 */
1317static void __cancel_request(struct ceph_osd_request *req) 1551static void send_request(struct ceph_osd_request *req)
1318{ 1552{
1319 if (req->r_sent && req->r_osd) { 1553 struct ceph_osd *osd = req->r_osd;
1554
1555 verify_osd_locked(osd);
1556 WARN_ON(osd->o_osd != req->r_t.osd);
1557
1558 /*
1559 * We may have a previously queued request message hanging
1560 * around. Cancel it to avoid corrupting the msgr.
1561 */
1562 if (req->r_sent)
1320 ceph_msg_revoke(req->r_request); 1563 ceph_msg_revoke(req->r_request);
1321 req->r_sent = 0; 1564
1565 req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR;
1566 if (req->r_attempts)
1567 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1568 else
1569 WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
1570
1571 encode_request(req, req->r_request);
1572
1573 dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n",
1574 __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
1575 req->r_t.osd, req->r_flags, req->r_attempts);
1576
1577 req->r_t.paused = false;
1578 req->r_stamp = jiffies;
1579 req->r_attempts++;
1580
1581 req->r_sent = osd->o_incarnation;
1582 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
1583 ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request));
1584}
1585
1586static void maybe_request_map(struct ceph_osd_client *osdc)
1587{
1588 bool continuous = false;
1589
1590 verify_osdc_locked(osdc);
1591 WARN_ON(!osdc->osdmap->epoch);
1592
1593 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1594 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
1595 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1596 dout("%s osdc %p continuous\n", __func__, osdc);
1597 continuous = true;
1598 } else {
1599 dout("%s osdc %p onetime\n", __func__, osdc);
1322 } 1600 }
1601
1602 if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
1603 osdc->osdmap->epoch + 1, continuous))
1604 ceph_monc_renew_subs(&osdc->client->monc);
1323} 1605}
1324 1606
1325static void __register_linger_request(struct ceph_osd_client *osdc, 1607static void send_map_check(struct ceph_osd_request *req);
1326 struct ceph_osd_request *req) 1608
1609static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
1327{ 1610{
1328 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1611 struct ceph_osd_client *osdc = req->r_osdc;
1329 WARN_ON(!req->r_linger); 1612 struct ceph_osd *osd;
1613 enum calc_target_result ct_res;
1614 bool need_send = false;
1615 bool promoted = false;
1616
1617 WARN_ON(req->r_tid || req->r_got_reply);
1618 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
1619
1620again:
1621 ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false);
1622 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
1623 goto promote;
1624
1625 osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked);
1626 if (IS_ERR(osd)) {
1627 WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked);
1628 goto promote;
1629 }
1630
1631 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1632 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) {
1633 dout("req %p pausewr\n", req);
1634 req->r_t.paused = true;
1635 maybe_request_map(osdc);
1636 } else if ((req->r_flags & CEPH_OSD_FLAG_READ) &&
1637 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1638 dout("req %p pauserd\n", req);
1639 req->r_t.paused = true;
1640 maybe_request_map(osdc);
1641 } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) &&
1642 !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY |
1643 CEPH_OSD_FLAG_FULL_FORCE)) &&
1644 (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
1645 pool_full(osdc, req->r_t.base_oloc.pool))) {
1646 dout("req %p full/pool_full\n", req);
1647 pr_warn_ratelimited("FULL or reached pool quota\n");
1648 req->r_t.paused = true;
1649 maybe_request_map(osdc);
1650 } else if (!osd_homeless(osd)) {
1651 need_send = true;
1652 } else {
1653 maybe_request_map(osdc);
1654 }
1655
1656 mutex_lock(&osd->lock);
1657 /*
1658 * Assign the tid atomically with send_request() to protect
1659 * multiple writes to the same object from racing with each
1660 * other, resulting in out of order ops on the OSDs.
1661 */
1662 req->r_tid = atomic64_inc_return(&osdc->last_tid);
1663 link_request(osd, req);
1664 if (need_send)
1665 send_request(req);
1666 mutex_unlock(&osd->lock);
1330 1667
1668 if (ct_res == CALC_TARGET_POOL_DNE)
1669 send_map_check(req);
1670
1671 if (promoted)
1672 downgrade_write(&osdc->lock);
1673 return;
1674
1675promote:
1676 up_read(&osdc->lock);
1677 down_write(&osdc->lock);
1678 wrlocked = true;
1679 promoted = true;
1680 goto again;
1681}
1682
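The promote: label implements a read-to-write lock upgrade by hand, since rwsems have no atomic upgrade; skeleton of the pattern:

	down_read(&osdc->lock);
again:
	/* fast path; if a session must be created and !wrlocked: */
	up_read(&osdc->lock);
	down_write(&osdc->lock);
	wrlocked = true;
	goto again;	/* state may have changed; recompute the target */

with downgrade_write() returning to the read lock once the slow path has done its work.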
1683static void account_request(struct ceph_osd_request *req)
1684{
1685 unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
1686
1687 if (req->r_flags & CEPH_OSD_FLAG_READ) {
1688 WARN_ON(req->r_flags & mask);
1689 req->r_flags |= CEPH_OSD_FLAG_ACK;
1690 } else if (req->r_flags & CEPH_OSD_FLAG_WRITE)
1691 WARN_ON(!(req->r_flags & mask));
1692 else
1693 WARN_ON(1);
1694
1695 WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
1696 atomic_inc(&req->r_osdc->num_requests);
1697}
1698
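account_request() enforces the flag invariants rather than setting policy: a write submitted through this path is expected to already carry something like

	req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;

while reads must carry neither ACK nor ONDISK and get ACK added here.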
1699static void submit_request(struct ceph_osd_request *req, bool wrlocked)
1700{
1331 ceph_osdc_get_request(req); 1701 ceph_osdc_get_request(req);
1332 list_add_tail(&req->r_linger_item, &osdc->req_linger); 1702 account_request(req);
1333 if (req->r_osd) 1703 __submit_request(req, wrlocked);
1334 list_add_tail(&req->r_linger_osd_item,
1335 &req->r_osd->o_linger_requests);
1336} 1704}
1337 1705
1338static void __unregister_linger_request(struct ceph_osd_client *osdc, 1706static void __finish_request(struct ceph_osd_request *req)
1339 struct ceph_osd_request *req)
1340{ 1707{
1341 WARN_ON(!req->r_linger); 1708 struct ceph_osd_client *osdc = req->r_osdc;
1709 struct ceph_osd *osd = req->r_osd;
1342 1710
1343 if (list_empty(&req->r_linger_item)) { 1711 verify_osd_locked(osd);
1344 dout("%s %p tid %llu not registered\n", __func__, req, 1712 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1345 req->r_tid); 1713
1714 WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
1715 unlink_request(osd, req);
1716 atomic_dec(&osdc->num_requests);
1717
1718 /*
1719 * If an OSD has failed or returned and a request has been sent
1720 * twice, it's possible to get a reply and end up here while the
1721 * request message is queued for delivery. We will ignore the
1722 * reply, so not a big deal, but better to try and catch it.
1723 */
1724 ceph_msg_revoke(req->r_request);
1725 ceph_msg_revoke_incoming(req->r_reply);
1726}
1727
1728static void finish_request(struct ceph_osd_request *req)
1729{
1730 __finish_request(req);
1731 ceph_osdc_put_request(req);
1732}
1733
1734static void __complete_request(struct ceph_osd_request *req)
1735{
1736 if (req->r_callback)
1737 req->r_callback(req);
1738 else
1739 complete_all(&req->r_completion);
1740}
1741
1742/*
1743 * Note that this is open-coded in handle_reply(), which has to deal
1744 * with ack vs commit, dup acks, etc.
1745 */
1746static void complete_request(struct ceph_osd_request *req, int err)
1747{
1748 dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
1749
1750 req->r_result = err;
1751 __finish_request(req);
1752 __complete_request(req);
1753 complete_all(&req->r_safe_completion);
1754 ceph_osdc_put_request(req);
1755}
1756
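Completion fans out through r_callback when set, otherwise through r_completion. Synchronous users typically pair submission with a wait, along these lines (assuming the usual start/wait wrappers elsewhere in this file):

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);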
1757static void cancel_map_check(struct ceph_osd_request *req)
1758{
1759 struct ceph_osd_client *osdc = req->r_osdc;
1760 struct ceph_osd_request *lookup_req;
1761
1762 verify_osdc_wrlocked(osdc);
1763
1764 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1765 if (!lookup_req)
1346 return; 1766 return;
1767
1768 WARN_ON(lookup_req != req);
1769 erase_request_mc(&osdc->map_checks, req);
1770 ceph_osdc_put_request(req);
1771}
1772
1773static void cancel_request(struct ceph_osd_request *req)
1774{
1775 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
1776
1777 cancel_map_check(req);
1778 finish_request(req);
1779}
1780
1781static void check_pool_dne(struct ceph_osd_request *req)
1782{
1783 struct ceph_osd_client *osdc = req->r_osdc;
1784 struct ceph_osdmap *map = osdc->osdmap;
1785
1786 verify_osdc_wrlocked(osdc);
1787 WARN_ON(!map->epoch);
1788
1789 if (req->r_attempts) {
1790 /*
1791 * We sent a request earlier, which means that
1792 * previously the pool existed, and now it does not
1793 * (i.e., it was deleted).
1794 */
1795 req->r_map_dne_bound = map->epoch;
1796 dout("%s req %p tid %llu pool disappeared\n", __func__, req,
1797 req->r_tid);
1798 } else {
1799 dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__,
1800 req, req->r_tid, req->r_map_dne_bound, map->epoch);
1347 } 1801 }
1348 1802
1349 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 1803 if (req->r_map_dne_bound) {
1350 list_del_init(&req->r_linger_item); 1804 if (map->epoch >= req->r_map_dne_bound) {
1805 /* we had a new enough map */
1806 pr_info_ratelimited("tid %llu pool does not exist\n",
1807 req->r_tid);
1808 complete_request(req, -ENOENT);
1809 }
1810 } else {
1811 send_map_check(req);
1812 }
1813}
1351 1814
1352 if (req->r_osd) { 1815static void map_check_cb(struct ceph_mon_generic_request *greq)
1353 list_del_init(&req->r_linger_osd_item); 1816{
1354 maybe_move_osd_to_lru(osdc, req->r_osd); 1817 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
1355 if (list_empty(&req->r_osd_item)) 1818 struct ceph_osd_request *req;
1356 req->r_osd = NULL; 1819 u64 tid = greq->private_data;
1820
1821 WARN_ON(greq->result || !greq->u.newest);
1822
1823 down_write(&osdc->lock);
1824 req = lookup_request_mc(&osdc->map_checks, tid);
1825 if (!req) {
1826 dout("%s tid %llu dne\n", __func__, tid);
1827 goto out_unlock;
1357 } 1828 }
1829
1830 dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__,
1831 req, req->r_tid, req->r_map_dne_bound, greq->u.newest);
1832 if (!req->r_map_dne_bound)
1833 req->r_map_dne_bound = greq->u.newest;
1834 erase_request_mc(&osdc->map_checks, req);
1835 check_pool_dne(req);
1836
1358 ceph_osdc_put_request(req); 1837 ceph_osdc_put_request(req);
1838out_unlock:
1839 up_write(&osdc->lock);
1359} 1840}
1360 1841
1361void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 1842static void send_map_check(struct ceph_osd_request *req)
1362 struct ceph_osd_request *req)
1363{ 1843{
1364 if (!req->r_linger) { 1844 struct ceph_osd_client *osdc = req->r_osdc;
1365 dout("set_request_linger %p\n", req); 1845 struct ceph_osd_request *lookup_req;
1366 req->r_linger = 1; 1846 int ret;
1847
1848 verify_osdc_wrlocked(osdc);
1849
1850 lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid);
1851 if (lookup_req) {
1852 WARN_ON(lookup_req != req);
1853 return;
1367 } 1854 }
1855
1856 ceph_osdc_get_request(req);
1857 insert_request_mc(&osdc->map_checks, req);
1858 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
1859 map_check_cb, req->r_tid);
1860 WARN_ON(ret);
1368} 1861}
1369EXPORT_SYMBOL(ceph_osdc_set_request_linger);
1370 1862
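Taken together, the map-check machinery double-checks an apparent pool deletion against the newest osdmap before failing anything; the flow, as implemented above:

	send_map_check(req)
	  -> ceph_monc_get_version_async(..., "osdmap", map_check_cb, r_tid)
	  -> map_check_cb(): set r_map_dne_bound to the newest epoch
	  -> check_pool_dne(): complete_request(req, -ENOENT) once
	     map->epoch >= r_map_dne_bound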
1371/* 1863/*
1372 * Returns whether a request should be blocked from being sent 1864 * lingering requests, watch/notify v2 infrastructure
1373 * based on the current osdmap and osd_client settings.
1374 *
1375 * Caller should hold map_sem for read.
1376 */ 1865 */
1377static bool __req_should_be_paused(struct ceph_osd_client *osdc, 1866static void linger_release(struct kref *kref)
1378 struct ceph_osd_request *req)
1379{ 1867{
1380 bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); 1868 struct ceph_osd_linger_request *lreq =
1381 bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || 1869 container_of(kref, struct ceph_osd_linger_request, kref);
1382 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 1870
1383 return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || 1871 dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq,
1384 (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); 1872 lreq->reg_req, lreq->ping_req);
1873 WARN_ON(!RB_EMPTY_NODE(&lreq->node));
1874 WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node));
1875 WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node));
1876 WARN_ON(!list_empty(&lreq->scan_item));
1877 WARN_ON(!list_empty(&lreq->pending_lworks));
1878 WARN_ON(lreq->osd);
1879
1880 if (lreq->reg_req)
1881 ceph_osdc_put_request(lreq->reg_req);
1882 if (lreq->ping_req)
1883 ceph_osdc_put_request(lreq->ping_req);
1884 target_destroy(&lreq->t);
1885 kfree(lreq);
1385} 1886}
1386 1887
1888static void linger_put(struct ceph_osd_linger_request *lreq)
1889{
1890 if (lreq)
1891 kref_put(&lreq->kref, linger_release);
1892}
1893
1894static struct ceph_osd_linger_request *
1895linger_get(struct ceph_osd_linger_request *lreq)
1896{
1897 kref_get(&lreq->kref);
1898 return lreq;
1899}
1900
1901static struct ceph_osd_linger_request *
1902linger_alloc(struct ceph_osd_client *osdc)
1903{
1904 struct ceph_osd_linger_request *lreq;
1905
1906 lreq = kzalloc(sizeof(*lreq), GFP_NOIO);
1907 if (!lreq)
1908 return NULL;
1909
1910 kref_init(&lreq->kref);
1911 mutex_init(&lreq->lock);
1912 RB_CLEAR_NODE(&lreq->node);
1913 RB_CLEAR_NODE(&lreq->osdc_node);
1914 RB_CLEAR_NODE(&lreq->mc_node);
1915 INIT_LIST_HEAD(&lreq->scan_item);
1916 INIT_LIST_HEAD(&lreq->pending_lworks);
1917 init_completion(&lreq->reg_commit_wait);
1918 init_completion(&lreq->notify_finish_wait);
1919
1920 lreq->osdc = osdc;
1921 target_init(&lreq->t);
1922
1923 dout("%s lreq %p\n", __func__, lreq);
1924 return lreq;
1925}
1926
1927DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node)
1928DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node)
1929DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node)
1930
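A linger request can thus sit in up to three trees at once (per-OSD via node, per-client via osdc_node, map checks via mc_node), all keyed on linger_id. Hedged lifecycle sketch built from the helpers above and below:

	lreq = linger_alloc(osdc);	/* kref = 1 */
	linger_register(lreq);		/* +ref, assigns linger_id */
	link_linger(osd, lreq);		/* enters the per-OSD tree */
	...
	unlink_linger(osd, lreq);
	linger_unregister(lreq);	/* -ref */
	linger_put(lreq);		/* last ref frees via linger_release() */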
1387/* 1931/*
1388 * Calculate mapping of a request to a PG. Takes tiering into account. 1932 * Create linger request <-> OSD session relation.
1933 *
1934 * @lreq has to be registered, @osd may be homeless.
1389 */ 1935 */
1390static int __calc_request_pg(struct ceph_osdmap *osdmap, 1936static void link_linger(struct ceph_osd *osd,
1391 struct ceph_osd_request *req, 1937 struct ceph_osd_linger_request *lreq)
1392 struct ceph_pg *pg_out)
1393{ 1938{
1394 bool need_check_tiering; 1939 verify_osd_locked(osd);
1940 WARN_ON(!lreq->linger_id || lreq->osd);
1941 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1942 osd->o_osd, lreq, lreq->linger_id);
1395 1943
1396 need_check_tiering = false; 1944 if (!osd_homeless(osd))
1397 if (req->r_target_oloc.pool == -1) { 1945 __remove_osd_from_lru(osd);
1398 req->r_target_oloc = req->r_base_oloc; /* struct */ 1946 else
1399 need_check_tiering = true; 1947 atomic_inc(&osd->o_osdc->num_homeless);
1948
1949 get_osd(osd);
1950 insert_linger(&osd->o_linger_requests, lreq);
1951 lreq->osd = osd;
1952}
1953
1954static void unlink_linger(struct ceph_osd *osd,
1955 struct ceph_osd_linger_request *lreq)
1956{
1957 verify_osd_locked(osd);
1958 WARN_ON(lreq->osd != osd);
1959 dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd,
1960 osd->o_osd, lreq, lreq->linger_id);
1961
1962 lreq->osd = NULL;
1963 erase_linger(&osd->o_linger_requests, lreq);
1964 put_osd(osd);
1965
1966 if (!osd_homeless(osd))
1967 maybe_move_osd_to_lru(osd);
1968 else
1969 atomic_dec(&osd->o_osdc->num_homeless);
1970}
1971
1972static bool __linger_registered(struct ceph_osd_linger_request *lreq)
1973{
1974 verify_osdc_locked(lreq->osdc);
1975
1976 return !RB_EMPTY_NODE(&lreq->osdc_node);
1977}
1978
1979static bool linger_registered(struct ceph_osd_linger_request *lreq)
1980{
1981 struct ceph_osd_client *osdc = lreq->osdc;
1982 bool registered;
1983
1984 down_read(&osdc->lock);
1985 registered = __linger_registered(lreq);
1986 up_read(&osdc->lock);
1987
1988 return registered;
1989}
1990
1991static void linger_register(struct ceph_osd_linger_request *lreq)
1992{
1993 struct ceph_osd_client *osdc = lreq->osdc;
1994
1995 verify_osdc_wrlocked(osdc);
1996 WARN_ON(lreq->linger_id);
1997
1998 linger_get(lreq);
1999 lreq->linger_id = ++osdc->last_linger_id;
2000 insert_linger_osdc(&osdc->linger_requests, lreq);
2001}
2002
2003static void linger_unregister(struct ceph_osd_linger_request *lreq)
2004{
2005 struct ceph_osd_client *osdc = lreq->osdc;
2006
2007 verify_osdc_wrlocked(osdc);
2008
2009 erase_linger_osdc(&osdc->linger_requests, lreq);
2010 linger_put(lreq);
2011}
2012
2013static void cancel_linger_request(struct ceph_osd_request *req)
2014{
2015 struct ceph_osd_linger_request *lreq = req->r_priv;
2016
2017 WARN_ON(!req->r_linger);
2018 cancel_request(req);
2019 linger_put(lreq);
2020}
2021
2022struct linger_work {
2023 struct work_struct work;
2024 struct ceph_osd_linger_request *lreq;
2025 struct list_head pending_item;
2026 unsigned long queued_stamp;
2027
2028 union {
2029 struct {
2030 u64 notify_id;
2031 u64 notifier_id;
2032 void *payload; /* points into @msg front */
2033 size_t payload_len;
2034
2035 struct ceph_msg *msg; /* for ceph_msg_put() */
2036 } notify;
2037 struct {
2038 int err;
2039 } error;
2040 };
2041};
2042
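Each linger_work carries exactly one deferred callback; the union is discriminated by the work function passed to lwork_alloc(). The error path below shows the pattern:

	lwork = lwork_alloc(lreq, do_watch_error);
	lwork->error.err = lreq->last_error;
	lwork_queue(lwork);	/* runs on osdc->notify_wq */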
2043static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq,
2044 work_func_t workfn)
2045{
2046 struct linger_work *lwork;
2047
2048 lwork = kzalloc(sizeof(*lwork), GFP_NOIO);
2049 if (!lwork)
2050 return NULL;
2051
2052 INIT_WORK(&lwork->work, workfn);
2053 INIT_LIST_HEAD(&lwork->pending_item);
2054 lwork->lreq = linger_get(lreq);
2055
2056 return lwork;
2057}
2058
2059static void lwork_free(struct linger_work *lwork)
2060{
2061 struct ceph_osd_linger_request *lreq = lwork->lreq;
2062
2063 mutex_lock(&lreq->lock);
2064 list_del(&lwork->pending_item);
2065 mutex_unlock(&lreq->lock);
2066
2067 linger_put(lreq);
2068 kfree(lwork);
2069}
2070
2071static void lwork_queue(struct linger_work *lwork)
2072{
2073 struct ceph_osd_linger_request *lreq = lwork->lreq;
2074 struct ceph_osd_client *osdc = lreq->osdc;
2075
2076 verify_lreq_locked(lreq);
2077 WARN_ON(!list_empty(&lwork->pending_item));
2078
2079 lwork->queued_stamp = jiffies;
2080 list_add_tail(&lwork->pending_item, &lreq->pending_lworks);
2081 queue_work(osdc->notify_wq, &lwork->work);
2082}
2083
2084static void do_watch_notify(struct work_struct *w)
2085{
2086 struct linger_work *lwork = container_of(w, struct linger_work, work);
2087 struct ceph_osd_linger_request *lreq = lwork->lreq;
2088
2089 if (!linger_registered(lreq)) {
2090 dout("%s lreq %p not registered\n", __func__, lreq);
2091 goto out;
1400 } 2092 }
1401 if (req->r_target_oid.name_len == 0) { 2093
1402 ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); 2094 WARN_ON(!lreq->is_watch);
1403 need_check_tiering = true; 2095 dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n",
2096 __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id,
2097 lwork->notify.payload_len);
2098 lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id,
2099 lwork->notify.notifier_id, lwork->notify.payload,
2100 lwork->notify.payload_len);
2101
2102out:
2103 ceph_msg_put(lwork->notify.msg);
2104 lwork_free(lwork);
2105}
2106
2107static void do_watch_error(struct work_struct *w)
2108{
2109 struct linger_work *lwork = container_of(w, struct linger_work, work);
2110 struct ceph_osd_linger_request *lreq = lwork->lreq;
2111
2112 if (!linger_registered(lreq)) {
2113 dout("%s lreq %p not registered\n", __func__, lreq);
2114 goto out;
1404 } 2115 }
1405 2116
1406 if (need_check_tiering && 2117 dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err);
1407 (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 2118 lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err);
1408 struct ceph_pg_pool_info *pi; 2119
1409 2120out:
1410 pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); 2121 lwork_free(lwork);
1411 if (pi) { 2122}
1412 if ((req->r_flags & CEPH_OSD_FLAG_READ) && 2123
1413 pi->read_tier >= 0) 2124static void queue_watch_error(struct ceph_osd_linger_request *lreq)
1414 req->r_target_oloc.pool = pi->read_tier; 2125{
1415 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && 2126 struct linger_work *lwork;
1416 pi->write_tier >= 0) 2127
1417 req->r_target_oloc.pool = pi->write_tier; 2128 lwork = lwork_alloc(lreq, do_watch_error);
2129 if (!lwork) {
2130 pr_err("failed to allocate error-lwork\n");
2131 return;
2132 }
2133
2134 lwork->error.err = lreq->last_error;
2135 lwork_queue(lwork);
2136}
2137
2138static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq,
2139 int result)
2140{
2141 if (!completion_done(&lreq->reg_commit_wait)) {
2142 lreq->reg_commit_error = (result <= 0 ? result : 0);
2143 complete_all(&lreq->reg_commit_wait);
2144 }
2145}
2146
2147static void linger_commit_cb(struct ceph_osd_request *req)
2148{
2149 struct ceph_osd_linger_request *lreq = req->r_priv;
2150
2151 mutex_lock(&lreq->lock);
2152 dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
2153 lreq->linger_id, req->r_result);
2154 WARN_ON(!__linger_registered(lreq));
2155 linger_reg_commit_complete(lreq, req->r_result);
2156 lreq->committed = true;
2157
2158 if (!lreq->is_watch) {
2159 struct ceph_osd_data *osd_data =
2160 osd_req_op_data(req, 0, notify, response_data);
2161 void *p = page_address(osd_data->pages[0]);
2162
2163 WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY ||
2164 osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
2165
2166 /* make note of the notify_id */
2167 if (req->r_ops[0].outdata_len >= sizeof(u64)) {
2168 lreq->notify_id = ceph_decode_64(&p);
2169 dout("lreq %p notify_id %llu\n", lreq,
2170 lreq->notify_id);
2171 } else {
2172 dout("lreq %p no notify_id\n", lreq);
1418 } 2173 }
1419 /* !pi is caught in ceph_oloc_oid_to_pg() */
1420 } 2174 }
1421 2175
1422 return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, 2176 mutex_unlock(&lreq->lock);
1423 &req->r_target_oid, pg_out); 2177 linger_put(lreq);
1424} 2178}
1425 2179
1426static void __enqueue_request(struct ceph_osd_request *req) 2180static int normalize_watch_error(int err)
1427{ 2181{
1428 struct ceph_osd_client *osdc = req->r_osdc; 2182 /*
2183 * Translate ENOENT -> ENOTCONN so that a delete->disconnection
2184 * notification and a failure to reconnect because we raced with
2185 * the delete appear the same to the user.
2186 */
2187 if (err == -ENOENT)
2188 err = -ENOTCONN;
2189
2190 return err;
2191}
2192
2193static void linger_reconnect_cb(struct ceph_osd_request *req)
2194{
2195 struct ceph_osd_linger_request *lreq = req->r_priv;
2196
2197 mutex_lock(&lreq->lock);
2198 dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
2199 lreq, lreq->linger_id, req->r_result, lreq->last_error);
2200 if (req->r_result < 0) {
2201 if (!lreq->last_error) {
2202 lreq->last_error = normalize_watch_error(req->r_result);
2203 queue_watch_error(lreq);
2204 }
2205 }
1429 2206
1430 dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, 2207 mutex_unlock(&lreq->lock);
1431 req->r_osd ? req->r_osd->o_osd : -1); 2208 linger_put(lreq);
2209}
2210
2211static void send_linger(struct ceph_osd_linger_request *lreq)
2212{
2213 struct ceph_osd_request *req = lreq->reg_req;
2214 struct ceph_osd_req_op *op = &req->r_ops[0];
1432 2215
1433 if (req->r_osd) { 2216 verify_osdc_wrlocked(req->r_osdc);
1434 __remove_osd_from_lru(req->r_osd); 2217 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
1435 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); 2218
1436 list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); 2219 if (req->r_osd)
2220 cancel_linger_request(req);
2221
2222 request_reinit(req);
2223 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
2224 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
2225 req->r_flags = lreq->t.flags;
2226 req->r_mtime = lreq->mtime;
2227
2228 mutex_lock(&lreq->lock);
2229 if (lreq->is_watch && lreq->committed) {
2230 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2231 op->watch.cookie != lreq->linger_id);
2232 op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
2233 op->watch.gen = ++lreq->register_gen;
2234 dout("lreq %p reconnect register_gen %u\n", lreq,
2235 op->watch.gen);
2236 req->r_callback = linger_reconnect_cb;
1437 } else { 2237 } else {
1438 list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); 2238 if (!lreq->is_watch)
2239 lreq->notify_id = 0;
2240 else
2241 WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
2242 dout("lreq %p register\n", lreq);
2243 req->r_callback = linger_commit_cb;
1439 } 2244 }
2245 mutex_unlock(&lreq->lock);
2246
2247 req->r_priv = linger_get(lreq);
2248 req->r_linger = true;
2249
2250 submit_request(req, true);
1440} 2251}
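
send_linger() reuses one registration request for the lifetime of the watch: request_reinit() strips the old state, and the op is flipped between an initial WATCH and a RECONNECT with a bumped register_gen once the first registration has committed. A compilable sketch of just that decision (hypothetical names, not the libceph types):

#include <stdio.h>

enum watch_op { OP_WATCH, OP_RECONNECT };

struct linger {
	int committed;        /* initial registration acked */
	unsigned int gen;     /* bumped on every reconnect */
};

static enum watch_op prep_reg_op(struct linger *l)
{
	if (l->committed) {
		l->gen++;             /* lets pong handlers spot stale acks */
		return OP_RECONNECT;
	}
	return OP_WATCH;              /* first registration */
}

int main(void)
{
	struct linger l = { 0 };

	printf("op=%d gen=%u\n", prep_reg_op(&l), l.gen);  /* op=0 gen=0 */
	l.committed = 1;
	printf("op=%d gen=%u\n", prep_reg_op(&l), l.gen);  /* op=1 gen=1 */
	return 0;
}
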
1441 2252
1442/* 2253static void linger_ping_cb(struct ceph_osd_request *req)
1443 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
1444 * (as needed), and set the request r_osd appropriately. If there is
1445 * no up osd, set r_osd to NULL. Move the request to the appropriate list
1446 * (unsent, homeless) or leave it on the in-flight lru.
1447 *
1448 * Return 0 if unchanged, 1 if changed, or negative on error.
1449 *
1450 * Caller should hold map_sem for read and request_mutex.
1451 */
1452static int __map_request(struct ceph_osd_client *osdc,
1453 struct ceph_osd_request *req, int force_resend)
1454{ 2254{
1455 struct ceph_pg pgid; 2255 struct ceph_osd_linger_request *lreq = req->r_priv;
1456 int acting[CEPH_PG_MAX_SIZE]; 2256
1457 int num, o; 2257 mutex_lock(&lreq->lock);
1458 int err; 2258 dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
1459 bool was_paused; 2259 __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
1460 2260 lreq->last_error);
1461 dout("map_request %p tid %lld\n", req, req->r_tid); 2261 if (lreq->register_gen == req->r_ops[0].watch.gen) {
1462 2262 if (!req->r_result) {
1463 err = __calc_request_pg(osdc->osdmap, req, &pgid); 2263 lreq->watch_valid_thru = lreq->ping_sent;
1464 if (err) { 2264 } else if (!lreq->last_error) {
1465 list_move(&req->r_req_lru_item, &osdc->req_notarget); 2265 lreq->last_error = normalize_watch_error(req->r_result);
1466 return err; 2266 queue_watch_error(lreq);
1467 }
1468 req->r_pgid = pgid;
1469
1470 num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
1471 if (num < 0)
1472 num = 0;
1473
1474 was_paused = req->r_paused;
1475 req->r_paused = __req_should_be_paused(osdc, req);
1476 if (was_paused && !req->r_paused)
1477 force_resend = 1;
1478
1479 if ((!force_resend &&
1480 req->r_osd && req->r_osd->o_osd == o &&
1481 req->r_sent >= req->r_osd->o_incarnation &&
1482 req->r_num_pg_osds == num &&
1483 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
1484 (req->r_osd == NULL && o == -1) ||
1485 req->r_paused)
1486 return 0; /* no change */
1487
1488 dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
1489 req->r_tid, pgid.pool, pgid.seed, o,
1490 req->r_osd ? req->r_osd->o_osd : -1);
1491
1492 /* record full pg acting set */
1493 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
1494 req->r_num_pg_osds = num;
1495
1496 if (req->r_osd) {
1497 __cancel_request(req);
1498 list_del_init(&req->r_osd_item);
1499 list_del_init(&req->r_linger_osd_item);
1500 req->r_osd = NULL;
1501 }
1502
1503 req->r_osd = __lookup_osd(osdc, o);
1504 if (!req->r_osd && o >= 0) {
1505 err = -ENOMEM;
1506 req->r_osd = create_osd(osdc, o);
1507 if (!req->r_osd) {
1508 list_move(&req->r_req_lru_item, &osdc->req_notarget);
1509 goto out;
1510 } 2267 }
2268 } else {
2269 dout("lreq %p register_gen %u ignoring old pong %u\n", lreq,
2270 lreq->register_gen, req->r_ops[0].watch.gen);
2271 }
1511 2272
1512 dout("map_request osd %p is osd%d\n", req->r_osd, o); 2273 mutex_unlock(&lreq->lock);
1513 __insert_osd(osdc, req->r_osd); 2274 linger_put(lreq);
2275}
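
The counterpart of the register_gen bump above: linger_ping_cb() only trusts a pong whose generation matches the current registration, so an answer to a ping sent before a reconnect can neither extend watch_valid_thru nor raise an error. A self-contained sketch of that filter (illustrative names):

#include <stdio.h>

struct watch {
	unsigned int register_gen;   /* bumped on every reconnect */
	unsigned long valid_thru;    /* last ping known to be answered */
};

static void handle_pong(struct watch *w, unsigned int pong_gen,
			unsigned long ping_sent)
{
	if (pong_gen != w->register_gen) {
		printf("ignoring stale pong gen=%u (current %u)\n",
		       pong_gen, w->register_gen);
		return;
	}
	w->valid_thru = ping_sent;   /* the watch was live at ping time */
}

int main(void)
{
	struct watch w = { .register_gen = 2 };

	handle_pong(&w, 1, 100);     /* stale: sent before the reconnect */
	handle_pong(&w, 2, 200);     /* accepted */
	printf("valid_thru=%lu\n", w.valid_thru);  /* 200 */
	return 0;
}
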
2276
2277static void send_linger_ping(struct ceph_osd_linger_request *lreq)
2278{
2279 struct ceph_osd_client *osdc = lreq->osdc;
2280 struct ceph_osd_request *req = lreq->ping_req;
2281 struct ceph_osd_req_op *op = &req->r_ops[0];
1514 2282
1515 ceph_con_open(&req->r_osd->o_con, 2283 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) {
1516 CEPH_ENTITY_TYPE_OSD, o, 2284 dout("%s PAUSERD\n", __func__);
1517 &osdc->osdmap->osd_addr[o]); 2285 return;
1518 } 2286 }
1519 2287
1520 __enqueue_request(req); 2288 lreq->ping_sent = jiffies;
1521 err = 1; /* osd or pg changed */ 2289 dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n",
2290 __func__, lreq, lreq->linger_id, lreq->ping_sent,
2291 lreq->register_gen);
1522 2292
1523out: 2293 if (req->r_osd)
1524 return err; 2294 cancel_linger_request(req);
2295
2296 request_reinit(req);
2297 target_copy(&req->r_t, &lreq->t);
2298
2299 WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
2300 op->watch.cookie != lreq->linger_id ||
2301 op->watch.op != CEPH_OSD_WATCH_OP_PING);
2302 op->watch.gen = lreq->register_gen;
2303 req->r_callback = linger_ping_cb;
2304 req->r_priv = linger_get(lreq);
2305 req->r_linger = true;
2306
2307 ceph_osdc_get_request(req);
2308 account_request(req);
2309 req->r_tid = atomic64_inc_return(&osdc->last_tid);
2310 link_request(lreq->osd, req);
2311 send_request(req);
1525} 2312}
1526 2313
1527/* 2314static void linger_submit(struct ceph_osd_linger_request *lreq)
1528 * caller should hold map_sem (for read) and request_mutex
1529 */
1530static void __send_request(struct ceph_osd_client *osdc,
1531 struct ceph_osd_request *req)
1532{ 2315{
1533 void *p; 2316 struct ceph_osd_client *osdc = lreq->osdc;
2317 struct ceph_osd *osd;
1534 2318
1535 dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", 2319 calc_target(osdc, &lreq->t, &lreq->last_force_resend, false);
1536 req, req->r_tid, req->r_osd->o_osd, req->r_flags, 2320 osd = lookup_create_osd(osdc, lreq->t.osd, true);
1537 (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); 2321 link_linger(osd, lreq);
1538 2322
1539 /* fill in message content that changes each time we send it */ 2323 send_linger(lreq);
1540 put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); 2324}
1541 put_unaligned_le32(req->r_flags, req->r_request_flags);
1542 put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool);
1543 p = req->r_request_pgid;
1544 ceph_encode_64(&p, req->r_pgid.pool);
1545 ceph_encode_32(&p, req->r_pgid.seed);
1546 put_unaligned_le64(1, req->r_request_attempts); /* FIXME */
1547 memcpy(req->r_request_reassert_version, &req->r_reassert_version,
1548 sizeof(req->r_reassert_version));
1549 2325
1550 req->r_stamp = jiffies; 2326static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
1551 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 2327{
2328 struct ceph_osd_client *osdc = lreq->osdc;
2329 struct ceph_osd_linger_request *lookup_lreq;
1552 2330
1553 ceph_msg_get(req->r_request); /* send consumes a ref */ 2331 verify_osdc_wrlocked(osdc);
1554 2332
1555 req->r_sent = req->r_osd->o_incarnation; 2333 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2334 lreq->linger_id);
2335 if (!lookup_lreq)
2336 return;
1556 2337
1557 ceph_con_send(&req->r_osd->o_con, req->r_request); 2338 WARN_ON(lookup_lreq != lreq);
2339 erase_linger_mc(&osdc->linger_map_checks, lreq);
2340 linger_put(lreq);
1558} 2341}
1559 2342
1560/* 2343/*
1561 * Send any requests in the queue (req_unsent). 2344 * @lreq has to be both registered and linked.
1562 */ 2345 */
1563static void __send_queued(struct ceph_osd_client *osdc) 2346static void __linger_cancel(struct ceph_osd_linger_request *lreq)
2347{
2348 if (lreq->is_watch && lreq->ping_req->r_osd)
2349 cancel_linger_request(lreq->ping_req);
2350 if (lreq->reg_req->r_osd)
2351 cancel_linger_request(lreq->reg_req);
2352 cancel_linger_map_check(lreq);
2353 unlink_linger(lreq->osd, lreq);
2354 linger_unregister(lreq);
2355}
2356
2357static void linger_cancel(struct ceph_osd_linger_request *lreq)
1564{ 2358{
1565 struct ceph_osd_request *req, *tmp; 2359 struct ceph_osd_client *osdc = lreq->osdc;
1566 2360
1567 dout("__send_queued\n"); 2361 down_write(&osdc->lock);
1568 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) 2362 if (__linger_registered(lreq))
1569 __send_request(osdc, req); 2363 __linger_cancel(lreq);
2364 up_write(&osdc->lock);
1570} 2365}
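
linger_cancel() must re-check __linger_registered() after taking the rwsem for write, since the watch may have been torn down by another path between the caller's decision and lock acquisition. The same check-then-act shape with a pthreads rwlock, runnable stand-alone:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static int registered = 1;

static void __cancel(void)         /* caller holds lock for write */
{
	registered = 0;
	printf("cancelled\n");
}

static void cancel(void)
{
	pthread_rwlock_wrlock(&lock);
	if (registered)                /* re-check under the lock */
		__cancel();
	pthread_rwlock_unlock(&lock);
}

int main(void)
{
	cancel();   /* performs the teardown */
	cancel();   /* no-op: already unregistered */
	return 0;
}
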
1571 2366
1572/* 2367static void send_linger_map_check(struct ceph_osd_linger_request *lreq);
1573 * Caller should hold map_sem for read and request_mutex. 2368
1574 */ 2369static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq)
1575static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, 2370{
1576 struct ceph_osd_request *req, 2371 struct ceph_osd_client *osdc = lreq->osdc;
1577 bool nofail) 2372 struct ceph_osdmap *map = osdc->osdmap;
1578{ 2373
1579 int rc; 2374 verify_osdc_wrlocked(osdc);
1580 2375 WARN_ON(!map->epoch);
1581 __register_request(osdc, req); 2376
1582 req->r_sent = 0; 2377 if (lreq->register_gen) {
1583 req->r_got_reply = 0; 2378 lreq->map_dne_bound = map->epoch;
1584 rc = __map_request(osdc, req, 0); 2379 dout("%s lreq %p linger_id %llu pool disappeared\n", __func__,
1585 if (rc < 0) { 2380 lreq, lreq->linger_id);
1586 if (nofail) { 2381 } else {
1587 dout("osdc_start_request failed map, " 2382 dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n",
1588 " will retry %lld\n", req->r_tid); 2383 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
1589 rc = 0; 2384 map->epoch);
1590 } else {
1591 __unregister_request(osdc, req);
1592 }
1593 return rc;
1594 } 2385 }
1595 2386
1596 if (req->r_osd == NULL) { 2387 if (lreq->map_dne_bound) {
1597 dout("send_request %p no up osds in pg\n", req); 2388 if (map->epoch >= lreq->map_dne_bound) {
1598 ceph_monc_request_next_osdmap(&osdc->client->monc); 2389 /* we had a new enough map */
2390 pr_info("linger_id %llu pool does not exist\n",
2391 lreq->linger_id);
2392 linger_reg_commit_complete(lreq, -ENOENT);
2393 __linger_cancel(lreq);
2394 }
1599 } else { 2395 } else {
1600 __send_queued(osdc); 2396 send_linger_map_check(lreq);
1601 } 2397 }
2398}
1602 2399
1603 return 0; 2400static void linger_map_check_cb(struct ceph_mon_generic_request *greq)
2401{
2402 struct ceph_osd_client *osdc = &greq->monc->client->osdc;
2403 struct ceph_osd_linger_request *lreq;
2404 u64 linger_id = greq->private_data;
2405
2406 WARN_ON(greq->result || !greq->u.newest);
2407
2408 down_write(&osdc->lock);
2409 lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id);
2410 if (!lreq) {
2411 dout("%s linger_id %llu dne\n", __func__, linger_id);
2412 goto out_unlock;
2413 }
2414
2415 dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n",
2416 __func__, lreq, lreq->linger_id, lreq->map_dne_bound,
2417 greq->u.newest);
2418 if (!lreq->map_dne_bound)
2419 lreq->map_dne_bound = greq->u.newest;
2420 erase_linger_mc(&osdc->linger_map_checks, lreq);
2421 check_linger_pool_dne(lreq);
2422
2423 linger_put(lreq);
2424out_unlock:
2425 up_write(&osdc->lock);
2426}
2427
2428static void send_linger_map_check(struct ceph_osd_linger_request *lreq)
2429{
2430 struct ceph_osd_client *osdc = lreq->osdc;
2431 struct ceph_osd_linger_request *lookup_lreq;
2432 int ret;
2433
2434 verify_osdc_wrlocked(osdc);
2435
2436 lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks,
2437 lreq->linger_id);
2438 if (lookup_lreq) {
2439 WARN_ON(lookup_lreq != lreq);
2440 return;
2441 }
2442
2443 linger_get(lreq);
2444 insert_linger_mc(&osdc->linger_map_checks, lreq);
2445 ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap",
2446 linger_map_check_cb, lreq->linger_id);
2447 WARN_ON(ret);
2448}
2449
2450static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
2451{
2452 int ret;
2453
2454 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2455 ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
2456 return ret ?: lreq->reg_commit_error;
2457}
2458
2459static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
2460{
2461 int ret;
2462
2463 dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
2464 ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
2465 return ret ?: lreq->notify_finish_error;
1604} 2466}
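
Both wait helpers end with the GNU C conditional-with-omitted-operand, ret ?: err: an interrupted wait reports the wait error, and only a clean wakeup reports the operation's own status. Portable equivalent, runnable:

#include <stdio.h>

/* GNU C writes this as: wait_err ?: op_err */
static int combine(int wait_err, int op_err)
{
	return wait_err ? wait_err : op_err;
}

int main(void)
{
	printf("%d\n", combine(-4, -2));  /* -4: interrupted wait wins */
	printf("%d\n", combine(0, -2));   /* -2: operation's own error */
	return 0;
}
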
1605 2467
1606/* 2468/*
1607 * Timeout callback, called every N seconds when one or more osd 2469 * Timeout callback, called every N seconds. When one or more OSD
1608 * requests have been active for more than N seconds. When this 2470 * requests have been active for more than N seconds, we send a keepalive
1609 * happens, we ping all OSDs with requests that have timed out to 2471 * (tag + timestamp) to the OSD to ensure any communications channel
1610 * ensure any communications channel reset is detected. Reset the 2472 * reset is detected.
1611 * request timeouts another N seconds in the future as we go.
1612 * Reschedule the timeout event another N seconds in future (unless
1613 * there are no open requests).
1614 */ 2473 */
1615static void handle_timeout(struct work_struct *work) 2474static void handle_timeout(struct work_struct *work)
1616{ 2475{
1617 struct ceph_osd_client *osdc = 2476 struct ceph_osd_client *osdc =
1618 container_of(work, struct ceph_osd_client, timeout_work.work); 2477 container_of(work, struct ceph_osd_client, timeout_work.work);
1619 struct ceph_options *opts = osdc->client->options; 2478 struct ceph_options *opts = osdc->client->options;
1620 struct ceph_osd_request *req; 2479 unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
1621 struct ceph_osd *osd; 2480 LIST_HEAD(slow_osds);
1622 struct list_head slow_osds; 2481 struct rb_node *n, *p;
1623 dout("timeout\n");
1624 down_read(&osdc->map_sem);
1625
1626 ceph_monc_request_next_osdmap(&osdc->client->monc);
1627 2482
1628 mutex_lock(&osdc->request_mutex); 2483 dout("%s osdc %p\n", __func__, osdc);
2484 down_write(&osdc->lock);
1629 2485
1630 /* 2486 /*
1631 * ping osds that are a bit slow. this ensures that if there 2487 * ping osds that are a bit slow. this ensures that if there
1632 * is a break in the TCP connection we will notice, and reopen 2488 * is a break in the TCP connection we will notice, and reopen
1633 * a connection with that osd (from the fault callback). 2489 * a connection with that osd (from the fault callback).
1634 */ 2490 */
1635 INIT_LIST_HEAD(&slow_osds); 2491 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
1636 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 2492 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
1637 if (time_before(jiffies, 2493 bool found = false;
1638 req->r_stamp + opts->osd_keepalive_timeout)) 2494
1639 break; 2495 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2496 struct ceph_osd_request *req =
2497 rb_entry(p, struct ceph_osd_request, r_node);
2498
2499 if (time_before(req->r_stamp, cutoff)) {
2500 dout(" req %p tid %llu on osd%d is laggy\n",
2501 req, req->r_tid, osd->o_osd);
2502 found = true;
2503 }
2504 }
2505 for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
2506 struct ceph_osd_linger_request *lreq =
2507 rb_entry(p, struct ceph_osd_linger_request, node);
2508
2509 dout(" lreq %p linger_id %llu is served by osd%d\n",
2510 lreq, lreq->linger_id, osd->o_osd);
2511 found = true;
2512
2513 mutex_lock(&lreq->lock);
2514 if (lreq->is_watch && lreq->committed && !lreq->last_error)
2515 send_linger_ping(lreq);
2516 mutex_unlock(&lreq->lock);
2517 }
1640 2518
1641 osd = req->r_osd; 2519 if (found)
1642 BUG_ON(!osd); 2520 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1643 dout(" tid %llu is slow, will send keepalive on osd%d\n",
1644 req->r_tid, osd->o_osd);
1645 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1646 } 2521 }
2522
2523 if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
2524 maybe_request_map(osdc);
2525
1647 while (!list_empty(&slow_osds)) { 2526 while (!list_empty(&slow_osds)) {
1648 osd = list_entry(slow_osds.next, struct ceph_osd, 2527 struct ceph_osd *osd = list_first_entry(&slow_osds,
1649 o_keepalive_item); 2528 struct ceph_osd,
2529 o_keepalive_item);
1650 list_del_init(&osd->o_keepalive_item); 2530 list_del_init(&osd->o_keepalive_item);
1651 ceph_con_keepalive(&osd->o_con); 2531 ceph_con_keepalive(&osd->o_con);
1652 } 2532 }
1653 2533
1654 __schedule_osd_timeout(osdc); 2534 up_write(&osdc->lock);
1655 __send_queued(osdc); 2535 schedule_delayed_work(&osdc->timeout_work,
1656 mutex_unlock(&osdc->request_mutex); 2536 osdc->client->options->osd_keepalive_timeout);
1657 up_read(&osdc->map_sem);
1658} 2537}
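
The laggy scan computes a cutoff of jiffies minus the keepalive timeout and compares with time_before(), which subtracts first and tests the sign so the comparison survives jiffies wraparound. A standalone demonstration of why the naive comparison fails at the wrap point:

#include <limits.h>
#include <stdio.h>

/* kernel's time_before(a, b): true if a is earlier than b */
#define time_before(a, b)  ((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long stamp = ULONG_MAX - 5;  /* just before wraparound */
	unsigned long cutoff = 10;            /* just after wraparound */

	/* naive "stamp < cutoff" is false here; the idiom is true,
	 * correctly treating stamp as 16 ticks earlier than cutoff */
	printf("naive: %d  time_before: %d\n",
	       stamp < cutoff, time_before(stamp, cutoff));
	return 0;
}
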
1659 2538
1660static void handle_osds_timeout(struct work_struct *work) 2539static void handle_osds_timeout(struct work_struct *work)
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work)
1663 container_of(work, struct ceph_osd_client, 2542 container_of(work, struct ceph_osd_client,
1664 osds_timeout_work.work); 2543 osds_timeout_work.work);
1665 unsigned long delay = osdc->client->options->osd_idle_ttl / 4; 2544 unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
2545 struct ceph_osd *osd, *nosd;
1666 2546
1667 dout("osds timeout\n"); 2547 dout("%s osdc %p\n", __func__, osdc);
1668 down_read(&osdc->map_sem); 2548 down_write(&osdc->lock);
1669 remove_old_osds(osdc); 2549 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
1670 up_read(&osdc->map_sem); 2550 if (time_before(jiffies, osd->lru_ttl))
2551 break;
1671 2552
2553 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
2554 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
2555 close_osd(osd);
2556 }
2557
2558 up_write(&osdc->lock);
1672 schedule_delayed_work(&osdc->osds_timeout_work, 2559 schedule_delayed_work(&osdc->osds_timeout_work,
1673 round_jiffies_relative(delay)); 2560 round_jiffies_relative(delay));
1674} 2561}
@@ -1776,107 +2663,76 @@ e_inval:
1776 goto out; 2663 goto out;
1777} 2664}
1778 2665
1779static void complete_request(struct ceph_osd_request *req) 2666struct MOSDOpReply {
1780{ 2667 struct ceph_pg pgid;
1781 complete_all(&req->r_safe_completion); /* fsync waiter */ 2668 u64 flags;
1782} 2669 int result;
2670 u32 epoch;
2671 int num_ops;
2672 u32 outdata_len[CEPH_OSD_MAX_OPS];
2673 s32 rval[CEPH_OSD_MAX_OPS];
2674 int retry_attempt;
2675 struct ceph_eversion replay_version;
2676 u64 user_version;
2677 struct ceph_request_redirect redirect;
2678};
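
decode_MOSDOpReply() below leans on the ceph_decode_*_safe() helpers, which verify the remaining buffer length before every read and branch to e_inval on truncation. A minimal userspace sketch of one such bounds-checked reader (simplified: no little-endian conversion, unlike the real decoders):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Read a u32 from *p only if at least 4 bytes remain before end. */
static int decode_u32_safe(const uint8_t **p, const uint8_t *end,
			   uint32_t *val)
{
	if (end - *p < 4)
		return -22;                 /* -EINVAL: truncated message */
	memcpy(val, *p, sizeof(*val));      /* real code also converts from LE */
	*p += 4;
	return 0;
}

int main(void)
{
	uint8_t buf[6] = { 1, 0, 0, 0, 9, 9 };
	const uint8_t *p = buf, *end = buf + sizeof(buf);
	uint32_t v = 0;
	int ret;

	ret = decode_u32_safe(&p, end, &v);
	printf("%d v=%u\n", ret, v);               /* 0 v=1 on LE hosts */
	printf("%d\n", decode_u32_safe(&p, end, &v)); /* -22: 2 bytes left */
	return 0;
}
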
1783 2679
1784/* 2680static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m)
1785 * handle osd op reply. either call the callback if it is specified,
1786 * or do the completion to wake up the waiting thread.
1787 */
1788static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1789{ 2681{
1790 void *p, *end; 2682 void *p = msg->front.iov_base;
1791 struct ceph_osd_request *req; 2683 void *const end = p + msg->front.iov_len;
1792 struct ceph_request_redirect redir; 2684 u16 version = le16_to_cpu(msg->hdr.version);
1793 u64 tid; 2685 struct ceph_eversion bad_replay_version;
1794 int object_len;
1795 unsigned int numops;
1796 int payload_len, flags;
1797 s32 result;
1798 s32 retry_attempt;
1799 struct ceph_pg pg;
1800 int err;
1801 u32 reassert_epoch;
1802 u64 reassert_version;
1803 u32 osdmap_epoch;
1804 int already_completed;
1805 u32 bytes;
1806 u8 decode_redir; 2686 u8 decode_redir;
1807 unsigned int i; 2687 u32 len;
1808 2688 int ret;
1809 tid = le64_to_cpu(msg->hdr.tid); 2689 int i;
1810 dout("handle_reply %p tid %llu\n", msg, tid);
1811 2690
1812 p = msg->front.iov_base; 2691 ceph_decode_32_safe(&p, end, len, e_inval);
1813 end = p + msg->front.iov_len; 2692 ceph_decode_need(&p, end, len, e_inval);
2693 p += len; /* skip oid */
1814 2694
1815 ceph_decode_need(&p, end, 4, bad); 2695 ret = ceph_decode_pgid(&p, end, &m->pgid);
1816 object_len = ceph_decode_32(&p); 2696 if (ret)
1817 ceph_decode_need(&p, end, object_len, bad); 2697 return ret;
1818 p += object_len;
1819 2698
1820 err = ceph_decode_pgid(&p, end, &pg); 2699 ceph_decode_64_safe(&p, end, m->flags, e_inval);
1821 if (err) 2700 ceph_decode_32_safe(&p, end, m->result, e_inval);
1822 goto bad; 2701 ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval);
2702 memcpy(&bad_replay_version, p, sizeof(bad_replay_version));
2703 p += sizeof(bad_replay_version);
2704 ceph_decode_32_safe(&p, end, m->epoch, e_inval);
1823 2705
1824 ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); 2706 ceph_decode_32_safe(&p, end, m->num_ops, e_inval);
1825 flags = ceph_decode_64(&p); 2707 if (m->num_ops > ARRAY_SIZE(m->outdata_len))
1826 result = ceph_decode_32(&p); 2708 goto e_inval;
1827 reassert_epoch = ceph_decode_32(&p);
1828 reassert_version = ceph_decode_64(&p);
1829 osdmap_epoch = ceph_decode_32(&p);
1830
1831 /* lookup */
1832 down_read(&osdc->map_sem);
1833 mutex_lock(&osdc->request_mutex);
1834 req = __lookup_request(osdc, tid);
1835 if (req == NULL) {
1836 dout("handle_reply tid %llu dne\n", tid);
1837 goto bad_mutex;
1838 }
1839 ceph_osdc_get_request(req);
1840 2709
1841 dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, 2710 ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op),
1842 req, result); 2711 e_inval);
1843 2712 for (i = 0; i < m->num_ops; i++) {
1844 ceph_decode_need(&p, end, 4, bad_put);
1845 numops = ceph_decode_32(&p);
1846 if (numops > CEPH_OSD_MAX_OPS)
1847 goto bad_put;
1848 if (numops != req->r_num_ops)
1849 goto bad_put;
1850 payload_len = 0;
1851 ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put);
1852 for (i = 0; i < numops; i++) {
1853 struct ceph_osd_op *op = p; 2713 struct ceph_osd_op *op = p;
1854 int len;
1855 2714
1856 len = le32_to_cpu(op->payload_len); 2715 m->outdata_len[i] = le32_to_cpu(op->payload_len);
1857 req->r_ops[i].outdata_len = len;
1858 dout(" op %d has %d bytes\n", i, len);
1859 payload_len += len;
1860 p += sizeof(*op); 2716 p += sizeof(*op);
1861 } 2717 }
1862 bytes = le32_to_cpu(msg->hdr.data_len);
1863 if (payload_len != bytes) {
1864 pr_warn("sum of op payload lens %d != data_len %d\n",
1865 payload_len, bytes);
1866 goto bad_put;
1867 }
1868 2718
1869 ceph_decode_need(&p, end, 4 + numops * 4, bad_put); 2719 ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval);
1870 retry_attempt = ceph_decode_32(&p); 2720 for (i = 0; i < m->num_ops; i++)
1871 for (i = 0; i < numops; i++) 2721 ceph_decode_32_safe(&p, end, m->rval[i], e_inval);
1872 req->r_ops[i].rval = ceph_decode_32(&p);
1873 2722
1874 if (le16_to_cpu(msg->hdr.version) >= 6) { 2723 if (version >= 5) {
1875 p += 8 + 4; /* skip replay_version */ 2724 ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval);
1876 p += 8; /* skip user_version */ 2725 memcpy(&m->replay_version, p, sizeof(m->replay_version));
2726 p += sizeof(m->replay_version);
2727 ceph_decode_64_safe(&p, end, m->user_version, e_inval);
2728 } else {
2729 m->replay_version = bad_replay_version; /* struct */
2730 m->user_version = le64_to_cpu(m->replay_version.version);
2731 }
1877 2732
1878 if (le16_to_cpu(msg->hdr.version) >= 7) 2733 if (version >= 6) {
1879 ceph_decode_8_safe(&p, end, decode_redir, bad_put); 2734 if (version >= 7)
2735 ceph_decode_8_safe(&p, end, decode_redir, e_inval);
1880 else 2736 else
1881 decode_redir = 1; 2737 decode_redir = 1;
1882 } else { 2738 } else {
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1884 } 2740 }
1885 2741
1886 if (decode_redir) { 2742 if (decode_redir) {
1887 err = ceph_redirect_decode(&p, end, &redir); 2743 ret = ceph_redirect_decode(&p, end, &m->redirect);
1888 if (err) 2744 if (ret)
1889 goto bad_put; 2745 return ret;
1890 } else { 2746 } else {
1891 redir.oloc.pool = -1; 2747 ceph_oloc_init(&m->redirect.oloc);
1892 } 2748 }
1893 2749
1894 if (redir.oloc.pool != -1) { 2750 return 0;
1895 dout("redirect pool %lld\n", redir.oloc.pool);
1896
1897 __unregister_request(osdc, req);
1898
1899 req->r_target_oloc = redir.oloc; /* struct */
1900 2751
1901 /* 2752e_inval:
1902 * Start redirect requests with nofail=true. If 2753 return -EINVAL;
1903 * mapping fails, request will end up on the notarget 2754}
1904 * list, waiting for the new osdmap (which can take
1905 * a while), even though the original request mapped
1906 * successfully. In the future we might want to follow
1907 * original request's nofail setting here.
1908 */
1909 err = __ceph_osdc_start_request(osdc, req, true);
1910 BUG_ON(err);
1911 2755
1912 goto out_unlock; 2756/*
1913 } 2757 * We are done with @req if
2758 * - @m is a safe reply, or
2759 * - @m is an unsafe reply and we didn't want a safe one
2760 */
2761static bool done_request(const struct ceph_osd_request *req,
2762 const struct MOSDOpReply *m)
2763{
2764 return (m->result < 0 ||
2765 (m->flags & CEPH_OSD_FLAG_ONDISK) ||
2766 !(req->r_flags & CEPH_OSD_FLAG_ONDISK));
2767}
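
done_request() is the one predicate deciding whether a reply retires the request: an error always does, an ONDISK ("safe") reply always does, and a plain ACK does only if the submitter never asked for ONDISK. A toy evaluation over the four interesting cases (the flag value is invented for illustration, not the wire constant):

#include <stdio.h>

#define FLAG_ONDISK  0x20   /* illustrative value only */

static int done(int result, int reply_flags, int req_flags)
{
	return result < 0 ||
	       (reply_flags & FLAG_ONDISK) ||
	       !(req_flags & FLAG_ONDISK);
}

int main(void)
{
	printf("%d\n", done(-5, 0, FLAG_ONDISK));          /* 1: error */
	printf("%d\n", done(0, FLAG_ONDISK, FLAG_ONDISK)); /* 1: safe reply */
	printf("%d\n", done(0, 0, FLAG_ONDISK));           /* 0: ack, want safe */
	printf("%d\n", done(0, 0, 0));                     /* 1: ack suffices */
	return 0;
}
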
1914 2768
1915 already_completed = req->r_got_reply; 2769/*
1916 if (!req->r_got_reply) { 2770 * handle osd op reply. either call the callback if it is specified,
1917 req->r_result = result; 2771 * or do the completion to wake up the waiting thread.
1918 dout("handle_reply result %d bytes %d\n", req->r_result, 2772 *
1919 bytes); 2773 * ->r_unsafe_callback is set? yes no
1920 if (req->r_result == 0) 2774 *
1921 req->r_result = bytes; 2775 * first reply is OK (needed r_cb/r_completion, r_cb/r_completion,
2776 * any or needed/got safe) r_safe_completion r_safe_completion
2777 *
2778 * first reply is unsafe r_unsafe_cb(true) (nothing)
2779 *
2780 * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion,
2781 * r_safe_completion r_safe_completion
2782 */
2783static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
2784{
2785 struct ceph_osd_client *osdc = osd->o_osdc;
2786 struct ceph_osd_request *req;
2787 struct MOSDOpReply m;
2788 u64 tid = le64_to_cpu(msg->hdr.tid);
2789 u32 data_len = 0;
2790 bool already_acked;
2791 int ret;
2792 int i;
1922 2793
1923 /* in case this is a write and we need to replay, */ 2794 dout("%s msg %p tid %llu\n", __func__, msg, tid);
1924 req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch);
1925 req->r_reassert_version.version = cpu_to_le64(reassert_version);
1926 2795
1927 req->r_got_reply = 1; 2796 down_read(&osdc->lock);
1928 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 2797 if (!osd_registered(osd)) {
1929 dout("handle_reply tid %llu dup ack\n", tid); 2798 dout("%s osd%d unknown\n", __func__, osd->o_osd);
1930 goto out_unlock; 2799 goto out_unlock_osdc;
1931 } 2800 }
2801 WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));
1932 2802
1933 dout("handle_reply tid %llu flags %d\n", tid, flags); 2803 mutex_lock(&osd->lock);
2804 req = lookup_request(&osd->o_requests, tid);
2805 if (!req) {
2806 dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid);
2807 goto out_unlock_session;
2808 }
1934 2809
1935 if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) 2810 ret = decode_MOSDOpReply(msg, &m);
1936 __register_linger_request(osdc, req); 2811 if (ret) {
2812 pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
2813 req->r_tid, ret);
2814 ceph_msg_dump(msg);
2815 goto fail_request;
2816 }
2817 dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n",
2818 __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed,
2819 m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch),
2820 le64_to_cpu(m.replay_version.version), m.user_version);
2821
2822 if (m.retry_attempt >= 0) {
2823 if (m.retry_attempt != req->r_attempts - 1) {
2824 dout("req %p tid %llu retry_attempt %d != %d, ignoring\n",
2825 req, req->r_tid, m.retry_attempt,
2826 req->r_attempts - 1);
2827 goto out_unlock_session;
2828 }
2829 } else {
2830 WARN_ON(1); /* MOSDOpReply v4 is assumed */
2831 }
1937 2832
1938 /* either this is a read, or we got the safe response */ 2833 if (!ceph_oloc_empty(&m.redirect.oloc)) {
1939 if (result < 0 || 2834 dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid,
1940 (flags & CEPH_OSD_FLAG_ONDISK) || 2835 m.redirect.oloc.pool);
1941 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 2836 unlink_request(osd, req);
1942 __unregister_request(osdc, req); 2837 mutex_unlock(&osd->lock);
2838
2839 ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
2840 req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
2841 req->r_tid = 0;
2842 __submit_request(req, false);
2843 goto out_unlock_osdc;
2844 }
1943 2845
1944 mutex_unlock(&osdc->request_mutex); 2846 if (m.num_ops != req->r_num_ops) {
1945 up_read(&osdc->map_sem); 2847 pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
2848 req->r_num_ops, req->r_tid);
2849 goto fail_request;
2850 }
2851 for (i = 0; i < req->r_num_ops; i++) {
2852 dout(" req %p tid %llu op %d rval %d len %u\n", req,
2853 req->r_tid, i, m.rval[i], m.outdata_len[i]);
2854 req->r_ops[i].rval = m.rval[i];
2855 req->r_ops[i].outdata_len = m.outdata_len[i];
2856 data_len += m.outdata_len[i];
2857 }
2858 if (data_len != le32_to_cpu(msg->hdr.data_len)) {
2859 pr_err("sum of lens %u != %u for tid %llu\n", data_len,
2860 le32_to_cpu(msg->hdr.data_len), req->r_tid);
2861 goto fail_request;
2862 }
2863 dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__,
2864 req, req->r_tid, req->r_got_reply, m.result, data_len);
2865
2866 already_acked = req->r_got_reply;
2867 if (!already_acked) {
2868 req->r_result = m.result ?: data_len;
2869 req->r_replay_version = m.replay_version; /* struct */
2870 req->r_got_reply = true;
2871 } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) {
2872 dout("req %p tid %llu dup ack\n", req, req->r_tid);
2873 goto out_unlock_session;
2874 }
1946 2875
1947 if (!already_completed) { 2876 if (done_request(req, &m)) {
1948 if (req->r_unsafe_callback && 2877 __finish_request(req);
1949 result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) 2878 if (req->r_linger) {
1950 req->r_unsafe_callback(req, true); 2879 WARN_ON(req->r_unsafe_callback);
1951 if (req->r_callback) 2880 dout("req %p tid %llu cb (locked)\n", req, req->r_tid);
1952 req->r_callback(req, msg); 2881 __complete_request(req);
1953 else 2882 }
1954 complete_all(&req->r_completion);
1955 } 2883 }
1956 2884
1957 if (flags & CEPH_OSD_FLAG_ONDISK) { 2885 mutex_unlock(&osd->lock);
1958 if (req->r_unsafe_callback && already_completed) 2886 up_read(&osdc->lock);
2887
2888 if (done_request(req, &m)) {
2889 if (already_acked && req->r_unsafe_callback) {
2890 dout("req %p tid %llu safe-cb\n", req, req->r_tid);
1959 req->r_unsafe_callback(req, false); 2891 req->r_unsafe_callback(req, false);
1960 complete_request(req); 2892 } else if (!req->r_linger) {
2893 dout("req %p tid %llu cb\n", req, req->r_tid);
2894 __complete_request(req);
2895 }
2896 } else {
2897 if (req->r_unsafe_callback) {
2898 dout("req %p tid %llu unsafe-cb\n", req, req->r_tid);
2899 req->r_unsafe_callback(req, true);
2900 } else {
2901 WARN_ON(1);
2902 }
1961 } 2903 }
2904 if (m.flags & CEPH_OSD_FLAG_ONDISK)
2905 complete_all(&req->r_safe_completion);
1962 2906
1963out:
1964 dout("req=%p req->r_linger=%d\n", req, req->r_linger);
1965 ceph_osdc_put_request(req); 2907 ceph_osdc_put_request(req);
1966 return; 2908 return;
1967out_unlock:
1968 mutex_unlock(&osdc->request_mutex);
1969 up_read(&osdc->map_sem);
1970 goto out;
1971 2909
1972bad_put: 2910fail_request:
1973 req->r_result = -EIO; 2911 complete_request(req, -EIO);
1974 __unregister_request(osdc, req); 2912out_unlock_session:
1975 if (req->r_callback) 2913 mutex_unlock(&osd->lock);
1976 req->r_callback(req, msg); 2914out_unlock_osdc:
1977 else 2915 up_read(&osdc->lock);
1978 complete_all(&req->r_completion);
1979 complete_request(req);
1980 ceph_osdc_put_request(req);
1981bad_mutex:
1982 mutex_unlock(&osdc->request_mutex);
1983 up_read(&osdc->map_sem);
1984bad:
1985 pr_err("corrupt osd_op_reply got %d %d\n",
1986 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
1987 ceph_msg_dump(msg);
1988} 2916}
1989 2917
1990static void reset_changed_osds(struct ceph_osd_client *osdc) 2918static void set_pool_was_full(struct ceph_osd_client *osdc)
1991{ 2919{
1992 struct rb_node *p, *n; 2920 struct rb_node *n;
1993 2921
1994 dout("%s %p\n", __func__, osdc); 2922 for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) {
1995 for (p = rb_first(&osdc->osds); p; p = n) { 2923 struct ceph_pg_pool_info *pi =
1996 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 2924 rb_entry(n, struct ceph_pg_pool_info, node);
1997 2925
1998 n = rb_next(p); 2926 pi->was_full = __pool_full(pi);
1999 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2000 memcmp(&osd->o_con.peer_addr,
2001 ceph_osd_addr(osdc->osdmap,
2002 osd->o_osd),
2003 sizeof(struct ceph_entity_addr)) != 0)
2004 __reset_osd(osdc, osd);
2005 } 2927 }
2006} 2928}
2007 2929
2930static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id)
2931{
2932 struct ceph_pg_pool_info *pi;
2933
2934 pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id);
2935 if (!pi)
2936 return false;
2937
2938 return pi->was_full && !__pool_full(pi);
2939}
2940
2941static enum calc_target_result
2942recalc_linger_target(struct ceph_osd_linger_request *lreq)
2943{
2944 struct ceph_osd_client *osdc = lreq->osdc;
2945 enum calc_target_result ct_res;
2946
2947 ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true);
2948 if (ct_res == CALC_TARGET_NEED_RESEND) {
2949 struct ceph_osd *osd;
2950
2951 osd = lookup_create_osd(osdc, lreq->t.osd, true);
2952 if (osd != lreq->osd) {
2953 unlink_linger(lreq->osd, lreq);
2954 link_linger(osd, lreq);
2955 }
2956 }
2957
2958 return ct_res;
2959}
2960
2008/* 2961/*
2009 * Requeue requests whose mapping to an OSD has changed. If requests map to 2962 * Requeue requests whose mapping to an OSD has changed.
2010 * no osd, request a new map.
2011 *
2012 * Caller should hold map_sem for read.
2013 */ 2963 */
2014static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, 2964static void scan_requests(struct ceph_osd *osd,
2015 bool force_resend_writes) 2965 bool force_resend,
2966 bool cleared_full,
2967 bool check_pool_cleared_full,
2968 struct rb_root *need_resend,
2969 struct list_head *need_resend_linger)
2016{ 2970{
2017 struct ceph_osd_request *req, *nreq; 2971 struct ceph_osd_client *osdc = osd->o_osdc;
2018 struct rb_node *p; 2972 struct rb_node *n;
2019 int needmap = 0; 2973 bool force_resend_writes;
2020 int err; 2974
2021 bool force_resend_req; 2975 for (n = rb_first(&osd->o_linger_requests); n; ) {
2976 struct ceph_osd_linger_request *lreq =
2977 rb_entry(n, struct ceph_osd_linger_request, node);
2978 enum calc_target_result ct_res;
2979
2980 n = rb_next(n); /* recalc_linger_target() */
2981
2982 dout("%s lreq %p linger_id %llu\n", __func__, lreq,
2983 lreq->linger_id);
2984 ct_res = recalc_linger_target(lreq);
2985 switch (ct_res) {
2986 case CALC_TARGET_NO_ACTION:
2987 force_resend_writes = cleared_full ||
2988 (check_pool_cleared_full &&
2989 pool_cleared_full(osdc, lreq->t.base_oloc.pool));
2990 if (!force_resend && !force_resend_writes)
2991 break;
2992
2993 /* fall through */
2994 case CALC_TARGET_NEED_RESEND:
2995 cancel_linger_map_check(lreq);
2996 /*
2997 * scan_requests() for the previous epoch(s)
2998 * may have already added it to the list, since
2999 * it's not unlinked here.
3000 */
3001 if (list_empty(&lreq->scan_item))
3002 list_add_tail(&lreq->scan_item, need_resend_linger);
3003 break;
3004 case CALC_TARGET_POOL_DNE:
3005 check_linger_pool_dne(lreq);
3006 break;
3007 }
3008 }
2022 3009
2023 dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", 3010 for (n = rb_first(&osd->o_requests); n; ) {
2024 force_resend_writes ? " (force resend writes)" : ""); 3011 struct ceph_osd_request *req =
2025 mutex_lock(&osdc->request_mutex); 3012 rb_entry(n, struct ceph_osd_request, r_node);
2026 for (p = rb_first(&osdc->requests); p; ) { 3013 enum calc_target_result ct_res;
2027 req = rb_entry(p, struct ceph_osd_request, r_node); 3014
2028 p = rb_next(p); 3015 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3016
3017 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3018 ct_res = calc_target(osdc, &req->r_t,
3019 &req->r_last_force_resend, false);
3020 switch (ct_res) {
3021 case CALC_TARGET_NO_ACTION:
3022 force_resend_writes = cleared_full ||
3023 (check_pool_cleared_full &&
3024 pool_cleared_full(osdc, req->r_t.base_oloc.pool));
3025 if (!force_resend &&
3026 (!(req->r_flags & CEPH_OSD_FLAG_WRITE) ||
3027 !force_resend_writes))
3028 break;
3029
3030 /* fall through */
3031 case CALC_TARGET_NEED_RESEND:
3032 cancel_map_check(req);
3033 unlink_request(osd, req);
3034 insert_request(need_resend, req);
3035 break;
3036 case CALC_TARGET_POOL_DNE:
3037 check_pool_dne(req);
3038 break;
3039 }
3040 }
3041}
2029 3042
3043static int handle_one_map(struct ceph_osd_client *osdc,
3044 void *p, void *end, bool incremental,
3045 struct rb_root *need_resend,
3046 struct list_head *need_resend_linger)
3047{
3048 struct ceph_osdmap *newmap;
3049 struct rb_node *n;
3050 bool skipped_map = false;
3051 bool was_full;
3052
3053 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
3054 set_pool_was_full(osdc);
3055
3056 if (incremental)
3057 newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
3058 else
3059 newmap = ceph_osdmap_decode(&p, end);
3060 if (IS_ERR(newmap))
3061 return PTR_ERR(newmap);
3062
3063 if (newmap != osdc->osdmap) {
2030 /* 3064 /*
2031 * For linger requests that have not yet been 3065 * Preserve ->was_full before destroying the old map.
2032 * registered, move them to the linger list; they'll 3066 * For pools that weren't in the old map, ->was_full
2033 * be sent to the osd in the loop below. Unregister 3067 * should be false.
2034 * the request before re-registering it as a linger
2035 * request to ensure the __map_request() below
2036 * will decide it needs to be sent.
2037 */ 3068 */
2038 if (req->r_linger && list_empty(&req->r_linger_item)) { 3069 for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) {
2039 dout("%p tid %llu restart on osd%d\n", 3070 struct ceph_pg_pool_info *pi =
2040 req, req->r_tid, 3071 rb_entry(n, struct ceph_pg_pool_info, node);
2041 req->r_osd ? req->r_osd->o_osd : -1); 3072 struct ceph_pg_pool_info *old_pi;
2042 ceph_osdc_get_request(req); 3073
2043 __unregister_request(osdc, req); 3074 old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id);
2044 __register_linger_request(osdc, req); 3075 if (old_pi)
2045 ceph_osdc_put_request(req); 3076 pi->was_full = old_pi->was_full;
2046 continue; 3077 else
3078 WARN_ON(pi->was_full);
2047 } 3079 }
2048 3080
2049 force_resend_req = force_resend || 3081 if (osdc->osdmap->epoch &&
2050 (force_resend_writes && 3082 osdc->osdmap->epoch + 1 < newmap->epoch) {
2051 req->r_flags & CEPH_OSD_FLAG_WRITE); 3083 WARN_ON(incremental);
2052 err = __map_request(osdc, req, force_resend_req); 3084 skipped_map = true;
2053 if (err < 0)
2054 continue; /* error */
2055 if (req->r_osd == NULL) {
2056 dout("%p tid %llu maps to no osd\n", req, req->r_tid);
2057 needmap++; /* request a newer map */
2058 } else if (err > 0) {
2059 if (!req->r_linger) {
2060 dout("%p tid %llu requeued on osd%d\n", req,
2061 req->r_tid,
2062 req->r_osd ? req->r_osd->o_osd : -1);
2063 req->r_flags |= CEPH_OSD_FLAG_RETRY;
2064 }
2065 } 3085 }
3086
3087 ceph_osdmap_destroy(osdc->osdmap);
3088 osdc->osdmap = newmap;
2066 } 3089 }
2067 3090
2068 list_for_each_entry_safe(req, nreq, &osdc->req_linger, 3091 was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
2069 r_linger_item) { 3092 scan_requests(&osdc->homeless_osd, skipped_map, was_full, true,
2070 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 3093 need_resend, need_resend_linger);
2071 3094
2072 err = __map_request(osdc, req, 3095 for (n = rb_first(&osdc->osds); n; ) {
2073 force_resend || force_resend_writes); 3096 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2074 dout("__map_request returned %d\n", err); 3097
2075 if (err < 0) 3098 n = rb_next(n); /* close_osd() */
2076 continue; /* hrm! */ 3099
2077 if (req->r_osd == NULL || err > 0) { 3100 scan_requests(osd, skipped_map, was_full, true, need_resend,
2078 if (req->r_osd == NULL) { 3101 need_resend_linger);
2079 dout("lingering %p tid %llu maps to no osd\n", 3102 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
2080 req, req->r_tid); 3103 memcmp(&osd->o_con.peer_addr,
2081 /* 3104 ceph_osd_addr(osdc->osdmap, osd->o_osd),
2082 * A homeless lingering request makes 3105 sizeof(struct ceph_entity_addr)))
2083 * no sense, as its job is to keep 3106 close_osd(osd);
2084 * a particular OSD connection open. 3107 }
2085 * Request a newer map and kick the
2086 * request, knowing that it won't be
2087 * resent until we actually get a map
2088 * that can tell us where to send it.
2089 */
2090 needmap++;
2091 }
2092 3108
2093 dout("kicking lingering %p tid %llu osd%d\n", req, 3109 return 0;
2094 req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); 3110}
2095 __register_request(osdc, req); 3111
2096 __unregister_linger_request(osdc, req); 3112static void kick_requests(struct ceph_osd_client *osdc,
3113 struct rb_root *need_resend,
3114 struct list_head *need_resend_linger)
3115{
3116 struct ceph_osd_linger_request *lreq, *nlreq;
3117 struct rb_node *n;
3118
3119 for (n = rb_first(need_resend); n; ) {
3120 struct ceph_osd_request *req =
3121 rb_entry(n, struct ceph_osd_request, r_node);
3122 struct ceph_osd *osd;
3123
3124 n = rb_next(n);
3125 erase_request(need_resend, req); /* before link_request() */
3126
3127 WARN_ON(req->r_osd);
3128 calc_target(osdc, &req->r_t, NULL, false);
3129 osd = lookup_create_osd(osdc, req->r_t.osd, true);
3130 link_request(osd, req);
3131 if (!req->r_linger) {
3132 if (!osd_homeless(osd) && !req->r_t.paused)
3133 send_request(req);
3134 } else {
3135 cancel_linger_request(req);
2097 } 3136 }
2098 } 3137 }
2099 reset_changed_osds(osdc);
2100 mutex_unlock(&osdc->request_mutex);
2101 3138
2102 if (needmap) { 3139 list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) {
2103 dout("%d requests for down osds, need new map\n", needmap); 3140 if (!osd_homeless(lreq->osd))
2104 ceph_monc_request_next_osdmap(&osdc->client->monc); 3141 send_linger(lreq);
3142
3143 list_del_init(&lreq->scan_item);
2105 } 3144 }
2106} 3145}
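
Both scan loops above fetch rb_next() before touching the current entry (note the "n = rb_next(n);" comments), because the body may unlink the node, after which following its links would be unsafe. The same advance-then-act pattern on a plain singly linked list, runnable stand-alone:

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static struct node *head;

static void unlink_node(struct node *n)   /* body may remove current */
{
	struct node **pp;

	for (pp = &head; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			free(n);
			return;
		}
	}
}

int main(void)
{
	struct node *n, *next;
	int i;

	for (i = 3; i >= 1; i--) {        /* build 1 -> 2 -> 3 */
		n = malloc(sizeof(*n));
		n->id = i;
		n->next = head;
		head = n;
	}

	for (n = head; n; n = next) {
		next = n->next;           /* grab the successor first */
		if (n->id == 2)
			unlink_node(n);   /* safe: iterator already advanced */
	}

	for (n = head; n; n = n->next)
		printf("%d ", n->id);     /* 1 3 */
	printf("\n");
	return 0;
}
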
2107 3146
2108
2109/* 3147/*
2110 * Process updated osd map. 3148 * Process updated osd map.
2111 * 3149 *
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
2115 */ 3153 */
2116void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) 3154void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2117{ 3155{
2118 void *p, *end, *next; 3156 void *p = msg->front.iov_base;
3157 void *const end = p + msg->front.iov_len;
2119 u32 nr_maps, maplen; 3158 u32 nr_maps, maplen;
2120 u32 epoch; 3159 u32 epoch;
2121 struct ceph_osdmap *newmap = NULL, *oldmap;
2122 int err;
2123 struct ceph_fsid fsid; 3160 struct ceph_fsid fsid;
2124 bool was_full; 3161 struct rb_root need_resend = RB_ROOT;
3162 LIST_HEAD(need_resend_linger);
3163 bool handled_incremental = false;
3164 bool was_pauserd, was_pausewr;
3165 bool pauserd, pausewr;
3166 int err;
2125 3167
2126 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); 3168 dout("%s have %u\n", __func__, osdc->osdmap->epoch);
2127 p = msg->front.iov_base; 3169 down_write(&osdc->lock);
2128 end = p + msg->front.iov_len;
2129 3170
2130 /* verify fsid */ 3171 /* verify fsid */
2131 ceph_decode_need(&p, end, sizeof(fsid), bad); 3172 ceph_decode_need(&p, end, sizeof(fsid), bad);
2132 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3173 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2133 if (ceph_check_fsid(osdc->client, &fsid) < 0) 3174 if (ceph_check_fsid(osdc->client, &fsid) < 0)
2134 return; 3175 goto bad;
2135 3176
2136 down_write(&osdc->map_sem); 3177 was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2137 3178 was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2138 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); 3179 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
3180 have_pool_full(osdc);
2139 3181
2140 /* incremental maps */ 3182 /* incremental maps */
2141 ceph_decode_32_safe(&p, end, nr_maps, bad); 3183 ceph_decode_32_safe(&p, end, nr_maps, bad);
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2145 epoch = ceph_decode_32(&p); 3187 epoch = ceph_decode_32(&p);
2146 maplen = ceph_decode_32(&p); 3188 maplen = ceph_decode_32(&p);
2147 ceph_decode_need(&p, end, maplen, bad); 3189 ceph_decode_need(&p, end, maplen, bad);
2148 next = p + maplen; 3190 if (osdc->osdmap->epoch &&
2149 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { 3191 osdc->osdmap->epoch + 1 == epoch) {
2150 dout("applying incremental map %u len %d\n", 3192 dout("applying incremental map %u len %d\n",
2151 epoch, maplen); 3193 epoch, maplen);
2152 newmap = osdmap_apply_incremental(&p, next, 3194 err = handle_one_map(osdc, p, p + maplen, true,
2153 osdc->osdmap, 3195 &need_resend, &need_resend_linger);
2154 &osdc->client->msgr); 3196 if (err)
2155 if (IS_ERR(newmap)) {
2156 err = PTR_ERR(newmap);
2157 goto bad; 3197 goto bad;
2158 } 3198 handled_incremental = true;
2159 BUG_ON(!newmap);
2160 if (newmap != osdc->osdmap) {
2161 ceph_osdmap_destroy(osdc->osdmap);
2162 osdc->osdmap = newmap;
2163 }
2164 was_full = was_full ||
2165 ceph_osdmap_flag(osdc->osdmap,
2166 CEPH_OSDMAP_FULL);
2167 kick_requests(osdc, 0, was_full);
2168 } else { 3199 } else {
2169 dout("ignoring incremental map %u len %d\n", 3200 dout("ignoring incremental map %u len %d\n",
2170 epoch, maplen); 3201 epoch, maplen);
2171 } 3202 }
2172 p = next; 3203 p += maplen;
2173 nr_maps--; 3204 nr_maps--;
2174 } 3205 }
2175 if (newmap) 3206 if (handled_incremental)
2176 goto done; 3207 goto done;
2177 3208
2178 /* full maps */ 3209 /* full maps */
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2186 if (nr_maps > 1) { 3217 if (nr_maps > 1) {
2187 dout("skipping non-latest full map %u len %d\n", 3218 dout("skipping non-latest full map %u len %d\n",
2188 epoch, maplen); 3219 epoch, maplen);
2189 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { 3220 } else if (osdc->osdmap->epoch >= epoch) {
2190 dout("skipping full map %u len %d, " 3221 dout("skipping full map %u len %d, "
2191 "older than our %u\n", epoch, maplen, 3222 "older than our %u\n", epoch, maplen,
2192 osdc->osdmap->epoch); 3223 osdc->osdmap->epoch);
2193 } else { 3224 } else {
2194 int skipped_map = 0;
2195
2196 dout("taking full map %u len %d\n", epoch, maplen); 3225 dout("taking full map %u len %d\n", epoch, maplen);
2197 newmap = ceph_osdmap_decode(&p, p+maplen); 3226 err = handle_one_map(osdc, p, p + maplen, false,
2198 if (IS_ERR(newmap)) { 3227 &need_resend, &need_resend_linger);
2199 err = PTR_ERR(newmap); 3228 if (err)
2200 goto bad; 3229 goto bad;
2201 }
2202 BUG_ON(!newmap);
2203 oldmap = osdc->osdmap;
2204 osdc->osdmap = newmap;
2205 if (oldmap) {
2206 if (oldmap->epoch + 1 < newmap->epoch)
2207 skipped_map = 1;
2208 ceph_osdmap_destroy(oldmap);
2209 }
2210 was_full = was_full ||
2211 ceph_osdmap_flag(osdc->osdmap,
2212 CEPH_OSDMAP_FULL);
2213 kick_requests(osdc, skipped_map, was_full);
2214 } 3230 }
2215 p += maplen; 3231 p += maplen;
2216 nr_maps--; 3232 nr_maps--;
2217 } 3233 }
2218 3234
2219 if (!osdc->osdmap)
2220 goto bad;
2221done: 3235done:
2222 downgrade_write(&osdc->map_sem);
2223 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
2224 osdc->osdmap->epoch);
2225
2226 /* 3236 /*
2227 * subscribe to subsequent osdmap updates if full to ensure 3237 * subscribe to subsequent osdmap updates if full to ensure
2228 * we find out when we are no longer full and stop returning 3238 * we find out when we are no longer full and stop returning
2229 * ENOSPC. 3239 * ENOSPC.
2230 */ 3240 */
2231 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || 3241 pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);
2232 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || 3242 pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
2233 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) 3243 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
2234 ceph_monc_request_next_osdmap(&osdc->client->monc); 3244 have_pool_full(osdc);
2235 3245 if (was_pauserd || was_pausewr || pauserd || pausewr)
2236 mutex_lock(&osdc->request_mutex); 3246 maybe_request_map(osdc);
2237 __send_queued(osdc); 3247
2238 mutex_unlock(&osdc->request_mutex); 3248 kick_requests(osdc, &need_resend, &need_resend_linger);
2239 up_read(&osdc->map_sem); 3249
3250 ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
3251 osdc->osdmap->epoch);
3252 up_write(&osdc->lock);
2240 wake_up_all(&osdc->client->auth_wq); 3253 wake_up_all(&osdc->client->auth_wq);
2241 return; 3254 return;
2242 3255
2243bad: 3256bad:
2244 pr_err("osdc handle_map corrupt msg\n"); 3257 pr_err("osdc handle_map corrupt msg\n");
2245 ceph_msg_dump(msg); 3258 ceph_msg_dump(msg);
2246 up_write(&osdc->map_sem); 3259 up_write(&osdc->lock);
2247} 3260}
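
ceph_osdc_handle_map() samples the pause/full state both before and after the new maps are applied, and asks for the next osdmap if any flag was set on either side; that keeps maps flowing until the client can actually observe the flags clearing. A condensed sketch of the decision (read/write pause collapsed into one predicate for brevity):

#include <stdbool.h>
#include <stdio.h>

struct map_state { bool pauserd, pausewr, full, pool_full; };

static bool paused(const struct map_state *s)
{
	/* PAUSEWR, cluster FULL and per-pool full all block writes */
	return s->pauserd || s->pausewr || s->full || s->pool_full;
}

int main(void)
{
	struct map_state before = { .full = true };
	struct map_state after  = { 0 };      /* new map cleared FULL */

	if (paused(&before) || paused(&after))
		printf("maybe_request_map(): keep osdmaps coming\n");
	return 0;
}
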
2248 3261
2249/* 3262/*
2250 * watch/notify callback event infrastructure 3263 * Resubmit requests pending on the given osd.
2251 *
2252 * These callbacks are used both for watch and notify operations.
2253 */ 3264 */
2254static void __release_event(struct kref *kref) 3265static void kick_osd_requests(struct ceph_osd *osd)
2255{ 3266{
2256 struct ceph_osd_event *event = 3267 struct rb_node *n;
2257 container_of(kref, struct ceph_osd_event, kref);
2258 3268
2259 dout("__release_event %p\n", event); 3269 for (n = rb_first(&osd->o_requests); n; ) {
2260 kfree(event); 3270 struct ceph_osd_request *req =
2261} 3271 rb_entry(n, struct ceph_osd_request, r_node);
2262 3272
2263static void get_event(struct ceph_osd_event *event) 3273 n = rb_next(n); /* cancel_linger_request() */
2264{
2265 kref_get(&event->kref);
2266}
2267 3274
2268void ceph_osdc_put_event(struct ceph_osd_event *event) 3275 if (!req->r_linger) {
2269{ 3276 if (!req->r_t.paused)
2270 kref_put(&event->kref, __release_event); 3277 send_request(req);
3278 } else {
3279 cancel_linger_request(req);
3280 }
3281 }
3282 for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) {
3283 struct ceph_osd_linger_request *lreq =
3284 rb_entry(n, struct ceph_osd_linger_request, node);
3285
3286 send_linger(lreq);
3287 }
2271} 3288}
2272EXPORT_SYMBOL(ceph_osdc_put_event);
2273 3289
2274static void __insert_event(struct ceph_osd_client *osdc, 3290/*
2275 struct ceph_osd_event *new) 3291 * If the osd connection drops, we need to resubmit all requests.
3292 */
3293static void osd_fault(struct ceph_connection *con)
2276{ 3294{
2277 struct rb_node **p = &osdc->event_tree.rb_node; 3295 struct ceph_osd *osd = con->private;
2278 struct rb_node *parent = NULL; 3296 struct ceph_osd_client *osdc = osd->o_osdc;
2279 struct ceph_osd_event *event = NULL;
2280 3297
2281 while (*p) { 3298 dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd);
2282 parent = *p; 3299
2283 event = rb_entry(parent, struct ceph_osd_event, node); 3300 down_write(&osdc->lock);
2284 if (new->cookie < event->cookie) 3301 if (!osd_registered(osd)) {
2285 p = &(*p)->rb_left; 3302 dout("%s osd%d unknown\n", __func__, osd->o_osd);
2286 else if (new->cookie > event->cookie) 3303 goto out_unlock;
2287 p = &(*p)->rb_right;
2288 else
2289 BUG();
2290 } 3304 }
2291 3305
2292 rb_link_node(&new->node, parent, p); 3306 if (!reopen_osd(osd))
2293 rb_insert_color(&new->node, &osdc->event_tree); 3307 kick_osd_requests(osd);
3308 maybe_request_map(osdc);
3309
3310out_unlock:
3311 up_write(&osdc->lock);
2294} 3312}
2295 3313
2296static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, 3314/*
2297 u64 cookie) 3315 * Process osd watch notifications
3316 */
3317static void handle_watch_notify(struct ceph_osd_client *osdc,
3318 struct ceph_msg *msg)
2298{ 3319{
2299 struct rb_node **p = &osdc->event_tree.rb_node; 3320 void *p = msg->front.iov_base;
2300 struct rb_node *parent = NULL; 3321 void *const end = p + msg->front.iov_len;
2301 struct ceph_osd_event *event = NULL; 3322 struct ceph_osd_linger_request *lreq;
3323 struct linger_work *lwork;
3324 u8 proto_ver, opcode;
3325 u64 cookie, notify_id;
3326 u64 notifier_id = 0;
3327 s32 return_code = 0;
3328 void *payload = NULL;
3329 u32 payload_len = 0;
2302 3330
2303 while (*p) { 3331 ceph_decode_8_safe(&p, end, proto_ver, bad);
2304 parent = *p; 3332 ceph_decode_8_safe(&p, end, opcode, bad);
2305 event = rb_entry(parent, struct ceph_osd_event, node); 3333 ceph_decode_64_safe(&p, end, cookie, bad);
2306 if (cookie < event->cookie) 3334 p += 8; /* skip ver */
2307 p = &(*p)->rb_left; 3335 ceph_decode_64_safe(&p, end, notify_id, bad);
2308 else if (cookie > event->cookie) 3336
2309 p = &(*p)->rb_right; 3337 if (proto_ver >= 1) {
2310 else 3338 ceph_decode_32_safe(&p, end, payload_len, bad);
2311 return event; 3339 ceph_decode_need(&p, end, payload_len, bad);
3340 payload = p;
3341 p += payload_len;
2312 } 3342 }
2313 return NULL;
2314}
2315 3343
2316static void __remove_event(struct ceph_osd_event *event) 3344 if (le16_to_cpu(msg->hdr.version) >= 2)
2317{ 3345 ceph_decode_32_safe(&p, end, return_code, bad);
2318 struct ceph_osd_client *osdc = event->osdc;
2319 3346
2320 if (!RB_EMPTY_NODE(&event->node)) { 3347 if (le16_to_cpu(msg->hdr.version) >= 3)
2321 dout("__remove_event removed %p\n", event); 3348 ceph_decode_64_safe(&p, end, notifier_id, bad);
2322 rb_erase(&event->node, &osdc->event_tree); 3349
2323 ceph_osdc_put_event(event); 3350 down_read(&osdc->lock);
3351 lreq = lookup_linger_osdc(&osdc->linger_requests, cookie);
3352 if (!lreq) {
3353 dout("%s opcode %d cookie %llu dne\n", __func__, opcode,
3354 cookie);
3355 goto out_unlock_osdc;
3356 }
3357
3358 mutex_lock(&lreq->lock);
3359 dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__,
3360 opcode, cookie, lreq, lreq->is_watch);
3361 if (opcode == CEPH_WATCH_EVENT_DISCONNECT) {
3362 if (!lreq->last_error) {
3363 lreq->last_error = -ENOTCONN;
3364 queue_watch_error(lreq);
3365 }
3366 } else if (!lreq->is_watch) {
3367 /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */
3368 if (lreq->notify_id && lreq->notify_id != notify_id) {
3369 dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq,
3370 lreq->notify_id, notify_id);
3371 } else if (!completion_done(&lreq->notify_finish_wait)) {
3372 struct ceph_msg_data *data =
3373 list_first_entry_or_null(&msg->data,
3374 struct ceph_msg_data,
3375 links);
3376
3377 if (data) {
3378 if (lreq->preply_pages) {
3379 WARN_ON(data->type !=
3380 CEPH_MSG_DATA_PAGES);
3381 *lreq->preply_pages = data->pages;
3382 *lreq->preply_len = data->length;
3383 } else {
3384 ceph_release_page_vector(data->pages,
3385 calc_pages_for(0, data->length));
3386 }
3387 }
3388 lreq->notify_finish_error = return_code;
3389 complete_all(&lreq->notify_finish_wait);
3390 }
2324 } else { 3391 } else {
2325 dout("__remove_event didn't remove %p\n", event); 3392 /* CEPH_WATCH_EVENT_NOTIFY */
3393 lwork = lwork_alloc(lreq, do_watch_notify);
3394 if (!lwork) {
3395 pr_err("failed to allocate notify-lwork\n");
3396 goto out_unlock_lreq;
3397 }
3398
3399 lwork->notify.notify_id = notify_id;
3400 lwork->notify.notifier_id = notifier_id;
3401 lwork->notify.payload = payload;
3402 lwork->notify.payload_len = payload_len;
3403 lwork->notify.msg = ceph_msg_get(msg);
3404 lwork_queue(lwork);
2326 } 3405 }
3406
3407out_unlock_lreq:
3408 mutex_unlock(&lreq->lock);
3409out_unlock_osdc:
3410 up_read(&osdc->lock);
3411 return;
3412
3413bad:
3414 pr_err("osdc handle_watch_notify corrupt msg\n");
2327} 3415}
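
handle_watch_notify() decodes optional fields by peer capability: the payload only when proto_ver >= 1, return_code only from message version 2, notifier_id only from version 3, with zero defaults so older peers parse cleanly. A small sketch of that forward-compatible style (the struct layout here is invented, not the wire format):

#include <stdint.h>
#include <stdio.h>

struct notify_msg {                /* hypothetical, not the wire format */
	uint16_t version;
	int32_t return_code_field;
	uint64_t notifier_id_field;
};

static void decode_notify(const struct notify_msg *m,
			  int32_t *return_code, uint64_t *notifier_id)
{
	*return_code = 0;              /* defaults for older peers */
	*notifier_id = 0;

	if (m->version >= 2)
		*return_code = m->return_code_field;
	if (m->version >= 3)
		*notifier_id = m->notifier_id_field;
}

int main(void)
{
	struct notify_msg old = { .version = 1, .return_code_field = -1,
				  .notifier_id_field = 42 };
	int32_t rc;
	uint64_t nid;

	decode_notify(&old, &rc, &nid);
	printf("rc=%d nid=%llu\n", rc, (unsigned long long)nid); /* 0 0 */
	return 0;
}
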
2328 3416
2329int ceph_osdc_create_event(struct ceph_osd_client *osdc, 3417/*
2330 void (*event_cb)(u64, u64, u8, void *), 3418 * Register request, send initial attempt.
2331 void *data, struct ceph_osd_event **pevent) 3419 */
3420int ceph_osdc_start_request(struct ceph_osd_client *osdc,
3421 struct ceph_osd_request *req,
3422 bool nofail)
2332{ 3423{
2333 struct ceph_osd_event *event; 3424 down_read(&osdc->lock);
2334 3425 submit_request(req, false);
2335 event = kmalloc(sizeof(*event), GFP_NOIO); 3426 up_read(&osdc->lock);
2336 if (!event)
2337 return -ENOMEM;
2338 3427
2339 dout("create_event %p\n", event);
2340 event->cb = event_cb;
2341 event->one_shot = 0;
2342 event->data = data;
2343 event->osdc = osdc;
2344 INIT_LIST_HEAD(&event->osd_node);
2345 RB_CLEAR_NODE(&event->node);
2346 kref_init(&event->kref); /* one ref for us */
2347 kref_get(&event->kref); /* one ref for the caller */
2348
2349 spin_lock(&osdc->event_lock);
2350 event->cookie = ++osdc->event_count;
2351 __insert_event(osdc, event);
2352 spin_unlock(&osdc->event_lock);
2353
2354 *pevent = event;
2355 return 0; 3428 return 0;
2356} 3429}
2357EXPORT_SYMBOL(ceph_osdc_create_event); 3430EXPORT_SYMBOL(ceph_osdc_start_request);
2358 3431
2359void ceph_osdc_cancel_event(struct ceph_osd_event *event) 3432/*
3433 * Unregister a registered request. The request is not completed (i.e.
3434 * no callbacks or wakeups) - higher layers are supposed to know what
3435 * they are canceling.
3436 */
3437void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2360{ 3438{
2361 struct ceph_osd_client *osdc = event->osdc; 3439 struct ceph_osd_client *osdc = req->r_osdc;
2362 3440
2363 dout("cancel_event %p\n", event); 3441 down_write(&osdc->lock);
2364 spin_lock(&osdc->event_lock); 3442 if (req->r_osd)
2365 __remove_event(event); 3443 cancel_request(req);
2366 spin_unlock(&osdc->event_lock); 3444 up_write(&osdc->lock);
2367 ceph_osdc_put_event(event); /* caller's */
2368} 3445}
2369EXPORT_SYMBOL(ceph_osdc_cancel_event); 3446EXPORT_SYMBOL(ceph_osdc_cancel_request);
2370
2371 3447
2372static void do_event_work(struct work_struct *work) 3448/*
3449 * @timeout: in jiffies, 0 means "wait forever"
3450 */
3451static int wait_request_timeout(struct ceph_osd_request *req,
3452 unsigned long timeout)
2373{ 3453{
2374 struct ceph_osd_event_work *event_work = 3454 long left;
2375 container_of(work, struct ceph_osd_event_work, work);
2376 struct ceph_osd_event *event = event_work->event;
2377 u64 ver = event_work->ver;
2378 u64 notify_id = event_work->notify_id;
2379 u8 opcode = event_work->opcode;
2380 3455
2381 dout("do_event_work completing %p\n", event); 3456 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
2382 event->cb(ver, notify_id, opcode, event->data); 3457 left = wait_for_completion_killable_timeout(&req->r_completion,
2383 dout("do_event_work completed %p\n", event); 3458 ceph_timeout_jiffies(timeout));
2384 ceph_osdc_put_event(event); 3459 if (left <= 0) {
2385 kfree(event_work); 3460 left = left ?: -ETIMEDOUT;
3461 ceph_osdc_cancel_request(req);
3462
3463 /* kludge - need to wake ceph_osdc_sync() */
3464 complete_all(&req->r_safe_completion);
3465 } else {
3466 left = req->r_result; /* completed */
3467 }
3468
3469 return left;
2386} 3470}
2387 3471
3472/*
3473 * wait for a request to complete
3474 */
3475int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
3476 struct ceph_osd_request *req)
3477{
3478 return wait_request_timeout(req, 0);
3479}
3480EXPORT_SYMBOL(ceph_osdc_wait_request);
2388 3481
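
Together, ceph_osdc_start_request() and ceph_osdc_wait_request() form the synchronous submit-and-wait pattern that the readpages/writepages helpers later in this file rely on. A minimal sketch of a caller (the function name is illustrative; the request is assumed to be already allocated and populated with ops):

	/* Sketch only -- mirrors the start/wait/put sequence used by
	 * ceph_osdc_readpages() further down in this patch. */
	static int sync_submit(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req)
	{
		int ret;

		ret = ceph_osdc_start_request(osdc, req, false); /* register + send */
		if (!ret)
			ret = ceph_osdc_wait_request(osdc, req); /* r_result or error */
		ceph_osdc_put_request(req);		/* drop caller's ref */
		return ret;
	}
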
2389/* 3482/*
2390 * Process osd watch notifications 3483 * sync - wait for all in-flight requests to flush. avoid starvation.
2391 */ 3484 */
2392static void handle_watch_notify(struct ceph_osd_client *osdc, 3485void ceph_osdc_sync(struct ceph_osd_client *osdc)
2393 struct ceph_msg *msg)
2394{ 3486{
2395 void *p, *end; 3487 struct rb_node *n, *p;
2396 u8 proto_ver; 3488 u64 last_tid = atomic64_read(&osdc->last_tid);
2397 u64 cookie, ver, notify_id;
2398 u8 opcode;
2399 struct ceph_osd_event *event;
2400 struct ceph_osd_event_work *event_work;
2401 3489
2402 p = msg->front.iov_base; 3490again:
2403 end = p + msg->front.iov_len; 3491 down_read(&osdc->lock);
3492 for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
3493 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
2404 3494
2405 ceph_decode_8_safe(&p, end, proto_ver, bad); 3495 mutex_lock(&osd->lock);
2406 ceph_decode_8_safe(&p, end, opcode, bad); 3496 for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
2407 ceph_decode_64_safe(&p, end, cookie, bad); 3497 struct ceph_osd_request *req =
2408 ceph_decode_64_safe(&p, end, ver, bad); 3498 rb_entry(p, struct ceph_osd_request, r_node);
2409 ceph_decode_64_safe(&p, end, notify_id, bad); 3499
3500 if (req->r_tid > last_tid)
3501 break;
3502
3503 if (!(req->r_flags & CEPH_OSD_FLAG_WRITE))
3504 continue;
2410 3505
2411 spin_lock(&osdc->event_lock); 3506 ceph_osdc_get_request(req);
2412 event = __find_event(osdc, cookie); 3507 mutex_unlock(&osd->lock);
2413 if (event) { 3508 up_read(&osdc->lock);
2414 BUG_ON(event->one_shot); 3509 dout("%s waiting on req %p tid %llu last_tid %llu\n",
2415 get_event(event); 3510 __func__, req, req->r_tid, last_tid);
2416 } 3511 wait_for_completion(&req->r_safe_completion);
2417 spin_unlock(&osdc->event_lock); 3512 ceph_osdc_put_request(req);
2418 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 3513 goto again;
2419 cookie, ver, event);
2420 if (event) {
2421 event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
2422 if (!event_work) {
2423 pr_err("couldn't allocate event_work\n");
2424 ceph_osdc_put_event(event);
2425 return;
2426 } 3514 }
2427 INIT_WORK(&event_work->work, do_event_work);
2428 event_work->event = event;
2429 event_work->ver = ver;
2430 event_work->notify_id = notify_id;
2431 event_work->opcode = opcode;
2432 3515
2433 queue_work(osdc->notify_wq, &event_work->work); 3516 mutex_unlock(&osd->lock);
2434 } 3517 }
2435 3518
2436 return; 3519 up_read(&osdc->lock);
3520 dout("%s done last_tid %llu\n", __func__, last_tid);
3521}
3522EXPORT_SYMBOL(ceph_osdc_sync);
2437 3523
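
Worth noting in ceph_osdc_sync() above: it never sleeps with osdc->lock or osd->lock held. Each write request found is pinned with ceph_osdc_get_request(), both locks are dropped before wait_for_completion() on r_safe_completion, and the scan restarts from rb_first() because the trees may have changed while unlocked; the last_tid snapshot taken on entry bounds the walk, so writes submitted after the call cannot starve it.
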
2438bad: 3524static struct ceph_osd_request *
2439 pr_err("osdc handle_watch_notify corrupt msg\n"); 3525alloc_linger_request(struct ceph_osd_linger_request *lreq)
3526{
3527 struct ceph_osd_request *req;
3528
3529 req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
3530 if (!req)
3531 return NULL;
3532
3533 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3534 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3535
3536 if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
3537 ceph_osdc_put_request(req);
3538 return NULL;
3539 }
3540
3541 return req;
2440} 3542}
2441 3543
2442/* 3544/*
2443 * build new request AND message 3545 * Returns a handle, caller owns a ref.
2444 *
2445 */ 3546 */
2446void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, 3547struct ceph_osd_linger_request *
2447 struct ceph_snap_context *snapc, u64 snap_id, 3548ceph_osdc_watch(struct ceph_osd_client *osdc,
2448 struct timespec *mtime) 3549 struct ceph_object_id *oid,
2449{ 3550 struct ceph_object_locator *oloc,
2450 struct ceph_msg *msg = req->r_request; 3551 rados_watchcb2_t wcb,
2451 void *p; 3552 rados_watcherrcb_t errcb,
2452 size_t msg_size; 3553 void *data)
2453 int flags = req->r_flags; 3554{
2454 u64 data_len; 3555 struct ceph_osd_linger_request *lreq;
2455 unsigned int i; 3556 int ret;
2456
2457 req->r_snapid = snap_id;
2458 req->r_snapc = ceph_get_snap_context(snapc);
2459
2460 /* encode request */
2461 msg->hdr.version = cpu_to_le16(4);
2462
2463 p = msg->front.iov_base;
2464 ceph_encode_32(&p, 1); /* client_inc is always 1 */
2465 req->r_request_osdmap_epoch = p;
2466 p += 4;
2467 req->r_request_flags = p;
2468 p += 4;
2469 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
2470 ceph_encode_timespec(p, mtime);
2471 p += sizeof(struct ceph_timespec);
2472 req->r_request_reassert_version = p;
2473 p += sizeof(struct ceph_eversion); /* will get filled in */
2474
2475 /* oloc */
2476 ceph_encode_8(&p, 4);
2477 ceph_encode_8(&p, 4);
2478 ceph_encode_32(&p, 8 + 4 + 4);
2479 req->r_request_pool = p;
2480 p += 8;
2481 ceph_encode_32(&p, -1); /* preferred */
2482 ceph_encode_32(&p, 0); /* key len */
2483 3557
2484 ceph_encode_8(&p, 1); 3558 lreq = linger_alloc(osdc);
2485 req->r_request_pgid = p; 3559 if (!lreq)
2486 p += 8 + 4; 3560 return ERR_PTR(-ENOMEM);
2487 ceph_encode_32(&p, -1); /* preferred */
2488 3561
2489 /* oid */ 3562 lreq->is_watch = true;
2490 ceph_encode_32(&p, req->r_base_oid.name_len); 3563 lreq->wcb = wcb;
2491 memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); 3564 lreq->errcb = errcb;
2492 dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, 3565 lreq->data = data;
2493 req->r_base_oid.name, req->r_base_oid.name_len); 3566 lreq->watch_valid_thru = jiffies;
2494 p += req->r_base_oid.name_len; 3567
2495 3568 ceph_oid_copy(&lreq->t.base_oid, oid);
2496 /* ops--can imply data */ 3569 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2497 ceph_encode_16(&p, (u16)req->r_num_ops); 3570 lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
2498 data_len = 0; 3571 lreq->mtime = CURRENT_TIME;
2499 for (i = 0; i < req->r_num_ops; i++) { 3572
2500 data_len += osd_req_encode_op(req, p, i); 3573 lreq->reg_req = alloc_linger_request(lreq);
2501 p += sizeof(struct ceph_osd_op); 3574 if (!lreq->reg_req) {
3575 ret = -ENOMEM;
3576 goto err_put_lreq;
2502 } 3577 }
2503 3578
2504 /* snaps */ 3579 lreq->ping_req = alloc_linger_request(lreq);
2505 ceph_encode_64(&p, req->r_snapid); 3580 if (!lreq->ping_req) {
2506 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); 3581 ret = -ENOMEM;
2507 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); 3582 goto err_put_lreq;
2508 if (req->r_snapc) {
2509 for (i = 0; i < snapc->num_snaps; i++) {
2510 ceph_encode_64(&p, req->r_snapc->snaps[i]);
2511 }
2512 } 3583 }
2513 3584
2514 req->r_request_attempts = p; 3585 down_write(&osdc->lock);
2515 p += 4; 3586 linger_register(lreq); /* before osd_req_op_* */
2516 3587 osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
2517 /* data */ 3588 CEPH_OSD_WATCH_OP_WATCH);
2518 if (flags & CEPH_OSD_FLAG_WRITE) { 3589 osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
2519 u16 data_off; 3590 CEPH_OSD_WATCH_OP_PING);
2520 3591 linger_submit(lreq);
2521 /* 3592 up_write(&osdc->lock);
2522 * The header "data_off" is a hint to the receiver 3593
2523 * allowing it to align received data into its 3594 ret = linger_reg_commit_wait(lreq);
2524 * buffers such that there's no need to re-copy 3595 if (ret) {
2525 * it before writing it to disk (direct I/O). 3596 linger_cancel(lreq);
2526 */ 3597 goto err_put_lreq;
2527 data_off = (u16) (off & 0xffff);
2528 req->r_request->hdr.data_off = cpu_to_le16(data_off);
2529 } 3598 }
2530 req->r_request->hdr.data_len = cpu_to_le32(data_len);
2531 3599
2532 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 3600 return lreq;
2533 msg_size = p - msg->front.iov_base;
2534 msg->front.iov_len = msg_size;
2535 msg->hdr.front_len = cpu_to_le32(msg_size);
2536 3601
2537 dout("build_request msg_size was %d\n", (int)msg_size); 3602err_put_lreq:
3603 linger_put(lreq);
3604 return ERR_PTR(ret);
2538} 3605}
2539EXPORT_SYMBOL(ceph_osdc_build_request); 3606EXPORT_SYMBOL(ceph_osdc_watch);
2540 3607
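
A hedged sketch of registering a watch with this new interface. The context struct and function names here are hypothetical, and the callback prototypes are assumed to follow the rados_watchcb2_t / rados_watcherrcb_t typedefs as rbd uses them:

	/* Hypothetical watcher context -- not part of this patch. */
	struct my_watcher {
		struct ceph_osd_client *osdc;
		struct ceph_object_id oid;
		struct ceph_object_locator oloc;
		struct ceph_osd_linger_request *handle;
	};

	static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
				u64 notifier_id, void *data, size_t data_len);
	static void my_watch_errcb(void *arg, u64 cookie, int err);

	static int my_watch_register(struct my_watcher *w)
	{
		w->handle = ceph_osdc_watch(w->osdc, &w->oid, &w->oloc,
					    my_watch_cb, my_watch_errcb, w);
		return PTR_ERR_OR_ZERO(w->handle);	/* handle owns a ref */
	}
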
2541/* 3608/*
2542 * Register request, send initial attempt. 3609 * Releases a ref.
3610 *
3611 * Times out after mount_timeout to preserve rbd unmap behaviour
3612 * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap
3613 * with mount_timeout").
2543 */ 3614 */
2544int ceph_osdc_start_request(struct ceph_osd_client *osdc, 3615int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
2545 struct ceph_osd_request *req, 3616 struct ceph_osd_linger_request *lreq)
2546 bool nofail)
2547{ 3617{
2548 int rc; 3618 struct ceph_options *opts = osdc->client->options;
3619 struct ceph_osd_request *req;
3620 int ret;
2549 3621
2550 down_read(&osdc->map_sem); 3622 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
2551 mutex_lock(&osdc->request_mutex); 3623 if (!req)
3624 return -ENOMEM;
3625
3626 ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
3627 ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
3628 req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
3629 req->r_mtime = CURRENT_TIME;
3630 osd_req_op_watch_init(req, 0, lreq->linger_id,
3631 CEPH_OSD_WATCH_OP_UNWATCH);
2552 3632
2553 rc = __ceph_osdc_start_request(osdc, req, nofail); 3633 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
3634 if (ret)
3635 goto out_put_req;
2554 3636
2555 mutex_unlock(&osdc->request_mutex); 3637 ceph_osdc_start_request(osdc, req, false);
2556 up_read(&osdc->map_sem); 3638 linger_cancel(lreq);
3639 linger_put(lreq);
3640 ret = wait_request_timeout(req, opts->mount_timeout);
2557 3641
2558 return rc; 3642out_put_req:
3643 ceph_osdc_put_request(req);
3644 return ret;
2559} 3645}
2560EXPORT_SYMBOL(ceph_osdc_start_request); 3646EXPORT_SYMBOL(ceph_osdc_unwatch);
2561 3647
2562/* 3648static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
2563 * Unregister a registered request. The request is not completed (i.e. 3649 u64 notify_id, u64 cookie, void *payload,
2564 * no callbacks or wakeups) - higher layers are supposed to know what 3650 size_t payload_len)
2565 * they are canceling.
2566 */
2567void ceph_osdc_cancel_request(struct ceph_osd_request *req)
2568{ 3651{
2569 struct ceph_osd_client *osdc = req->r_osdc; 3652 struct ceph_osd_req_op *op;
3653 struct ceph_pagelist *pl;
3654 int ret;
2570 3655
2571 mutex_lock(&osdc->request_mutex); 3656 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
2572 if (req->r_linger) 3657
2573 __unregister_linger_request(osdc, req); 3658 pl = kmalloc(sizeof(*pl), GFP_NOIO);
2574 __unregister_request(osdc, req); 3659 if (!pl)
2575 mutex_unlock(&osdc->request_mutex); 3660 return -ENOMEM;
3661
3662 ceph_pagelist_init(pl);
3663 ret = ceph_pagelist_encode_64(pl, notify_id);
3664 ret |= ceph_pagelist_encode_64(pl, cookie);
3665 if (payload) {
3666 ret |= ceph_pagelist_encode_32(pl, payload_len);
3667 ret |= ceph_pagelist_append(pl, payload, payload_len);
3668 } else {
3669 ret |= ceph_pagelist_encode_32(pl, 0);
3670 }
3671 if (ret) {
3672 ceph_pagelist_release(pl);
3673 return -ENOMEM;
3674 }
2576 3675
2577 dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); 3676 ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl);
3677 op->indata_len = pl->length;
3678 return 0;
2578} 3679}
2579EXPORT_SYMBOL(ceph_osdc_cancel_request);
2580 3680
2581/* 3681int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
2582 * wait for a request to complete 3682 struct ceph_object_id *oid,
2583 */ 3683 struct ceph_object_locator *oloc,
2584int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 3684 u64 notify_id,
2585 struct ceph_osd_request *req) 3685 u64 cookie,
3686 void *payload,
3687 size_t payload_len)
2586{ 3688{
2587 int rc; 3689 struct ceph_osd_request *req;
3690 int ret;
2588 3691
2589 dout("%s %p tid %llu\n", __func__, req, req->r_tid); 3692 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
3693 if (!req)
3694 return -ENOMEM;
2590 3695
2591 rc = wait_for_completion_interruptible(&req->r_completion); 3696 ceph_oid_copy(&req->r_base_oid, oid);
2592 if (rc < 0) { 3697 ceph_oloc_copy(&req->r_base_oloc, oloc);
2593 dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); 3698 req->r_flags = CEPH_OSD_FLAG_READ;
2594 ceph_osdc_cancel_request(req); 3699
2595 complete_request(req); 3700 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2596 return rc; 3701 if (ret)
3702 goto out_put_req;
3703
3704 ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
3705 payload_len);
3706 if (ret)
3707 goto out_put_req;
3708
3709 ceph_osdc_start_request(osdc, req, false);
3710 ret = ceph_osdc_wait_request(osdc, req);
3711
3712out_put_req:
3713 ceph_osdc_put_request(req);
3714 return ret;
3715}
3716EXPORT_SYMBOL(ceph_osdc_notify_ack);
3717
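
Continuing the hypothetical watcher sketched above: a watch callback would normally acknowledge each notify so that the notifier's ceph_osdc_notify() completes instead of waiting out its timeout (a sketch, not this patch's code):

	static void my_watch_cb(void *arg, u64 notify_id, u64 cookie,
				u64 notifier_id, void *data, size_t data_len)
	{
		struct my_watcher *w = arg;

		/* act on the payload (data, data_len) here, then ack */
		ceph_osdc_notify_ack(w->osdc, &w->oid, &w->oloc,
				     notify_id, cookie, NULL, 0);
	}
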
3718static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
3719 u64 cookie, u32 prot_ver, u32 timeout,
3720 void *payload, size_t payload_len)
3721{
3722 struct ceph_osd_req_op *op;
3723 struct ceph_pagelist *pl;
3724 int ret;
3725
3726 op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
3727 op->notify.cookie = cookie;
3728
3729 pl = kmalloc(sizeof(*pl), GFP_NOIO);
3730 if (!pl)
3731 return -ENOMEM;
3732
3733 ceph_pagelist_init(pl);
3734 ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
3735 ret |= ceph_pagelist_encode_32(pl, timeout);
3736 ret |= ceph_pagelist_encode_32(pl, payload_len);
3737 ret |= ceph_pagelist_append(pl, payload, payload_len);
3738 if (ret) {
3739 ceph_pagelist_release(pl);
3740 return -ENOMEM;
2597 } 3741 }
2598 3742
2599 dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, 3743 ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
2600 req->r_result); 3744 op->indata_len = pl->length;
2601 return req->r_result; 3745 return 0;
2602} 3746}
2603EXPORT_SYMBOL(ceph_osdc_wait_request);
2604 3747
2605/* 3748/*
2606 * sync - wait for all in-flight requests to flush. avoid starvation. 3749 * @timeout: in seconds
3750 *
3751 * @preply_{pages,len} are initialized both on success and error.
3752 * The caller is responsible for:
3753 *
3754 * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len))
2607 */ 3755 */
2608void ceph_osdc_sync(struct ceph_osd_client *osdc) 3756int ceph_osdc_notify(struct ceph_osd_client *osdc,
3757 struct ceph_object_id *oid,
3758 struct ceph_object_locator *oloc,
3759 void *payload,
3760 size_t payload_len,
3761 u32 timeout,
3762 struct page ***preply_pages,
3763 size_t *preply_len)
2609{ 3764{
2610 struct ceph_osd_request *req; 3765 struct ceph_osd_linger_request *lreq;
2611 u64 last_tid, next_tid = 0; 3766 struct page **pages;
3767 int ret;
2612 3768
2613 mutex_lock(&osdc->request_mutex); 3769 WARN_ON(!timeout);
2614 last_tid = osdc->last_tid; 3770 if (preply_pages) {
2615 while (1) { 3771 *preply_pages = NULL;
2616 req = __lookup_request_ge(osdc, next_tid); 3772 *preply_len = 0;
2617 if (!req) 3773 }
2618 break;
2619 if (req->r_tid > last_tid)
2620 break;
2621 3774
2622 next_tid = req->r_tid + 1; 3775 lreq = linger_alloc(osdc);
2623 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) 3776 if (!lreq)
2624 continue; 3777 return -ENOMEM;
2625 3778
2626 ceph_osdc_get_request(req); 3779 lreq->preply_pages = preply_pages;
2627 mutex_unlock(&osdc->request_mutex); 3780 lreq->preply_len = preply_len;
2628 dout("sync waiting on tid %llu (last is %llu)\n", 3781
2629 req->r_tid, last_tid); 3782 ceph_oid_copy(&lreq->t.base_oid, oid);
2630 wait_for_completion(&req->r_safe_completion); 3783 ceph_oloc_copy(&lreq->t.base_oloc, oloc);
2631 mutex_lock(&osdc->request_mutex); 3784 lreq->t.flags = CEPH_OSD_FLAG_READ;
2632 ceph_osdc_put_request(req); 3785
3786 lreq->reg_req = alloc_linger_request(lreq);
3787 if (!lreq->reg_req) {
3788 ret = -ENOMEM;
3789 goto out_put_lreq;
2633 } 3790 }
2634 mutex_unlock(&osdc->request_mutex); 3791
2635 dout("sync done (thru tid %llu)\n", last_tid); 3792 /* for notify_id */
3793 pages = ceph_alloc_page_vector(1, GFP_NOIO);
3794 if (IS_ERR(pages)) {
3795 ret = PTR_ERR(pages);
3796 goto out_put_lreq;
3797 }
3798
3799 down_write(&osdc->lock);
3800 linger_register(lreq); /* before osd_req_op_* */
3801 ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
3802 timeout, payload, payload_len);
3803 if (ret) {
3804 linger_unregister(lreq);
3805 up_write(&osdc->lock);
3806 ceph_release_page_vector(pages, 1);
3807 goto out_put_lreq;
3808 }
3809 ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
3810 response_data),
3811 pages, PAGE_SIZE, 0, false, true);
3812 linger_submit(lreq);
3813 up_write(&osdc->lock);
3814
3815 ret = linger_reg_commit_wait(lreq);
3816 if (!ret)
3817 ret = linger_notify_finish_wait(lreq);
3818 else
3819 dout("lreq %p failed to initiate notify %d\n", lreq, ret);
3820
3821 linger_cancel(lreq);
3822out_put_lreq:
3823 linger_put(lreq);
3824 return ret;
3825}
3826EXPORT_SYMBOL(ceph_osdc_notify);
3827
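
On the notifier side, the contract documented above means the reply page vector must always be released, whether or not the notify succeeded. A hedged sketch (the caller name and the 5-second timeout are made up):

	static int my_notify(struct ceph_osd_client *osdc,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc)
	{
		struct page **reply_pages;
		size_t reply_len;
		static const char payload[] = "ping";
		int ret;

		ret = ceph_osdc_notify(osdc, oid, oloc, (void *)payload,
				       sizeof(payload), 5 /* seconds */,
				       &reply_pages, &reply_len);
		/* initialized on success and error alike -- always release */
		if (reply_pages)
			ceph_release_page_vector(reply_pages,
						 calc_pages_for(0, reply_len));
		return ret;
	}
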
3828/*
3829 * Return the number of milliseconds since the watch was last
3830 * confirmed, or an error. If there is an error, the watch is no
3831 * longer valid, and should be destroyed with ceph_osdc_unwatch().
3832 */
3833int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
3834 struct ceph_osd_linger_request *lreq)
3835{
3836 unsigned long stamp, age;
3837 int ret;
3838
3839 down_read(&osdc->lock);
3840 mutex_lock(&lreq->lock);
3841 stamp = lreq->watch_valid_thru;
3842 if (!list_empty(&lreq->pending_lworks)) {
3843 struct linger_work *lwork =
3844 list_first_entry(&lreq->pending_lworks,
3845 struct linger_work,
3846 pending_item);
3847
3848 if (time_before(lwork->queued_stamp, stamp))
3849 stamp = lwork->queued_stamp;
3850 }
3851 age = jiffies - stamp;
3852 dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
3853 lreq, lreq->linger_id, age, lreq->last_error);
3854 /* we are truncating to msecs, so return a safe upper bound */
3855 ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
3856
3857 mutex_unlock(&lreq->lock);
3858 up_read(&osdc->lock);
3859 return ret;
2636} 3860}
2637EXPORT_SYMBOL(ceph_osdc_sync);
2638 3861
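
A sketch of how a client might drive this check from a periodic timer, roughly as rbd's watch machinery does; the re-registration step is elided and the names are hypothetical:

	/* Hypothetical periodic check, e.g. from a delayed work item. */
	static void my_watch_timer(struct my_watcher *w)
	{
		int ret = ceph_osdc_watch_check(w->osdc, w->handle);

		if (ret < 0) {
			/* no longer valid -- destroy, then re-register */
			ceph_osdc_unwatch(w->osdc, w->handle);
			w->handle = NULL;
		}
		/* ret > 0: ms since the watch was last confirmed */
	}
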
2639/* 3862/*
2640 * Call all pending notify callbacks - for use after a watch is 3863 * Call all pending notify callbacks - for use after a watch is
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc)
2646} 3869}
2647EXPORT_SYMBOL(ceph_osdc_flush_notifies); 3870EXPORT_SYMBOL(ceph_osdc_flush_notifies);
2648 3871
3872void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc)
3873{
3874 down_read(&osdc->lock);
3875 maybe_request_map(osdc);
3876 up_read(&osdc->lock);
3877}
3878EXPORT_SYMBOL(ceph_osdc_maybe_request_map);
2649 3879
2650/* 3880/*
2651 * init, shutdown 3881 * init, shutdown
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2656 3886
2657 dout("init\n"); 3887 dout("init\n");
2658 osdc->client = client; 3888 osdc->client = client;
2659 osdc->osdmap = NULL; 3889 init_rwsem(&osdc->lock);
2660 init_rwsem(&osdc->map_sem);
2661 init_completion(&osdc->map_waiters);
2662 osdc->last_requested_map = 0;
2663 mutex_init(&osdc->request_mutex);
2664 osdc->last_tid = 0;
2665 osdc->osds = RB_ROOT; 3890 osdc->osds = RB_ROOT;
2666 INIT_LIST_HEAD(&osdc->osd_lru); 3891 INIT_LIST_HEAD(&osdc->osd_lru);
2667 osdc->requests = RB_ROOT; 3892 spin_lock_init(&osdc->osd_lru_lock);
2668 INIT_LIST_HEAD(&osdc->req_lru); 3893 osd_init(&osdc->homeless_osd);
2669 INIT_LIST_HEAD(&osdc->req_unsent); 3894 osdc->homeless_osd.o_osdc = osdc;
2670 INIT_LIST_HEAD(&osdc->req_notarget); 3895 osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD;
2671 INIT_LIST_HEAD(&osdc->req_linger); 3896 osdc->linger_requests = RB_ROOT;
2672 osdc->num_requests = 0; 3897 osdc->map_checks = RB_ROOT;
3898 osdc->linger_map_checks = RB_ROOT;
2673 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); 3899 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
2674 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 3900 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
2675 spin_lock_init(&osdc->event_lock);
2676 osdc->event_tree = RB_ROOT;
2677 osdc->event_count = 0;
2678
2679 schedule_delayed_work(&osdc->osds_timeout_work,
2680 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
2681 3901
2682 err = -ENOMEM; 3902 err = -ENOMEM;
3903 osdc->osdmap = ceph_osdmap_alloc();
3904 if (!osdc->osdmap)
3905 goto out;
3906
2683 osdc->req_mempool = mempool_create_slab_pool(10, 3907 osdc->req_mempool = mempool_create_slab_pool(10,
2684 ceph_osd_request_cache); 3908 ceph_osd_request_cache);
2685 if (!osdc->req_mempool) 3909 if (!osdc->req_mempool)
2686 goto out; 3910 goto out_map;
2687 3911
2688 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, 3912 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
2689 OSD_OP_FRONT_LEN, 10, true, 3913 PAGE_SIZE, 10, true, "osd_op");
2690 "osd_op");
2691 if (err < 0) 3914 if (err < 0)
2692 goto out_mempool; 3915 goto out_mempool;
2693 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, 3916 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
2694 OSD_OPREPLY_FRONT_LEN, 10, true, 3917 PAGE_SIZE, 10, true, "osd_op_reply");
2695 "osd_op_reply");
2696 if (err < 0) 3918 if (err < 0)
2697 goto out_msgpool; 3919 goto out_msgpool;
2698 3920
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
2701 if (!osdc->notify_wq) 3923 if (!osdc->notify_wq)
2702 goto out_msgpool_reply; 3924 goto out_msgpool_reply;
2703 3925
3926 schedule_delayed_work(&osdc->timeout_work,
3927 osdc->client->options->osd_keepalive_timeout);
3928 schedule_delayed_work(&osdc->osds_timeout_work,
3929 round_jiffies_relative(osdc->client->options->osd_idle_ttl));
3930
2704 return 0; 3931 return 0;
2705 3932
2706out_msgpool_reply: 3933out_msgpool_reply:
@@ -2709,6 +3936,8 @@ out_msgpool:
2709 ceph_msgpool_destroy(&osdc->msgpool_op); 3936 ceph_msgpool_destroy(&osdc->msgpool_op);
2710out_mempool: 3937out_mempool:
2711 mempool_destroy(osdc->req_mempool); 3938 mempool_destroy(osdc->req_mempool);
3939out_map:
3940 ceph_osdmap_destroy(osdc->osdmap);
2712out: 3941out:
2713 return err; 3942 return err;
2714} 3943}
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
2719 destroy_workqueue(osdc->notify_wq); 3948 destroy_workqueue(osdc->notify_wq);
2720 cancel_delayed_work_sync(&osdc->timeout_work); 3949 cancel_delayed_work_sync(&osdc->timeout_work);
2721 cancel_delayed_work_sync(&osdc->osds_timeout_work); 3950 cancel_delayed_work_sync(&osdc->osds_timeout_work);
2722 if (osdc->osdmap) { 3951
2723 ceph_osdmap_destroy(osdc->osdmap); 3952 down_write(&osdc->lock);
2724 osdc->osdmap = NULL; 3953 while (!RB_EMPTY_ROOT(&osdc->osds)) {
3954 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
3955 struct ceph_osd, o_node);
3956 close_osd(osd);
2725 } 3957 }
2726 remove_all_osds(osdc); 3958 up_write(&osdc->lock);
3959 WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1);
3960 osd_cleanup(&osdc->homeless_osd);
3961
3962 WARN_ON(!list_empty(&osdc->osd_lru));
3963 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests));
3964 WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks));
3965 WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks));
3966 WARN_ON(atomic_read(&osdc->num_requests));
3967 WARN_ON(atomic_read(&osdc->num_homeless));
3968
3969 ceph_osdmap_destroy(osdc->osdmap);
2727 mempool_destroy(osdc->req_mempool); 3970 mempool_destroy(osdc->req_mempool);
2728 ceph_msgpool_destroy(&osdc->msgpool_op); 3971 ceph_msgpool_destroy(&osdc->msgpool_op);
2729 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 3972 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
2752 return PTR_ERR(req); 3995 return PTR_ERR(req);
2753 3996
2754 /* it may be a short read due to an object boundary */ 3997 /* it may be a short read due to an object boundary */
2755
2756 osd_req_op_extent_osd_data_pages(req, 0, 3998 osd_req_op_extent_osd_data_pages(req, 0,
2757 pages, *plen, page_align, false, false); 3999 pages, *plen, page_align, false, false);
2758 4000
2759 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", 4001 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
2760 off, *plen, *plen, page_align); 4002 off, *plen, *plen, page_align);
2761 4003
2762 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
2763
2764 rc = ceph_osdc_start_request(osdc, req, false); 4004 rc = ceph_osdc_start_request(osdc, req, false);
2765 if (!rc) 4005 if (!rc)
2766 rc = ceph_osdc_wait_request(osdc, req); 4006 rc = ceph_osdc_wait_request(osdc, req);
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2786 int rc = 0; 4026 int rc = 0;
2787 int page_align = off & ~PAGE_MASK; 4027 int page_align = off & ~PAGE_MASK;
2788 4028
2789 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
2790 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 4029 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
2791 CEPH_OSD_OP_WRITE, 4030 CEPH_OSD_OP_WRITE,
2792 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 4031 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2800 false, false); 4039 false, false);
2801 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); 4040 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
2802 4041
2803 ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); 4042 req->r_mtime = *mtime;
2804
2805 rc = ceph_osdc_start_request(osdc, req, true); 4043 rc = ceph_osdc_start_request(osdc, req, true);
2806 if (!rc) 4044 if (!rc)
2807 rc = ceph_osdc_wait_request(osdc, req); 4045 rc = ceph_osdc_wait_request(osdc, req);
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup);
2841static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 4079static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2842{ 4080{
2843 struct ceph_osd *osd = con->private; 4081 struct ceph_osd *osd = con->private;
2844 struct ceph_osd_client *osdc; 4082 struct ceph_osd_client *osdc = osd->o_osdc;
2845 int type = le16_to_cpu(msg->hdr.type); 4083 int type = le16_to_cpu(msg->hdr.type);
2846 4084
2847 if (!osd)
2848 goto out;
2849 osdc = osd->o_osdc;
2850
2851 switch (type) { 4085 switch (type) {
2852 case CEPH_MSG_OSD_MAP: 4086 case CEPH_MSG_OSD_MAP:
2853 ceph_osdc_handle_map(osdc, msg); 4087 ceph_osdc_handle_map(osdc, msg);
2854 break; 4088 break;
2855 case CEPH_MSG_OSD_OPREPLY: 4089 case CEPH_MSG_OSD_OPREPLY:
2856 handle_reply(osdc, msg); 4090 handle_reply(osd, msg);
2857 break; 4091 break;
2858 case CEPH_MSG_WATCH_NOTIFY: 4092 case CEPH_MSG_WATCH_NOTIFY:
2859 handle_watch_notify(osdc, msg); 4093 handle_watch_notify(osdc, msg);
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2863 pr_err("received unknown message type %d %s\n", type, 4097 pr_err("received unknown message type %d %s\n", type,
2864 ceph_msg_type_name(type)); 4098 ceph_msg_type_name(type));
2865 } 4099 }
2866out: 4100
2867 ceph_msg_put(msg); 4101 ceph_msg_put(msg);
2868} 4102}
2869 4103
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2878{ 4112{
2879 struct ceph_osd *osd = con->private; 4113 struct ceph_osd *osd = con->private;
2880 struct ceph_osd_client *osdc = osd->o_osdc; 4114 struct ceph_osd_client *osdc = osd->o_osdc;
2881 struct ceph_msg *m; 4115 struct ceph_msg *m = NULL;
2882 struct ceph_osd_request *req; 4116 struct ceph_osd_request *req;
2883 int front_len = le32_to_cpu(hdr->front_len); 4117 int front_len = le32_to_cpu(hdr->front_len);
2884 int data_len = le32_to_cpu(hdr->data_len); 4118 int data_len = le32_to_cpu(hdr->data_len);
2885 u64 tid; 4119 u64 tid = le64_to_cpu(hdr->tid);
2886 4120
2887 tid = le64_to_cpu(hdr->tid); 4121 down_read(&osdc->lock);
2888 mutex_lock(&osdc->request_mutex); 4122 if (!osd_registered(osd)) {
2889 req = __lookup_request(osdc, tid); 4123 dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd);
4124 *skip = 1;
4125 goto out_unlock_osdc;
4126 }
4127 WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num));
4128
4129 mutex_lock(&osd->lock);
4130 req = lookup_request(&osd->o_requests, tid);
2890 if (!req) { 4131 if (!req) {
2891 dout("%s osd%d tid %llu unknown, skipping\n", __func__, 4132 dout("%s osd%d tid %llu unknown, skipping\n", __func__,
2892 osd->o_osd, tid); 4133 osd->o_osd, tid);
2893 m = NULL;
2894 *skip = 1; 4134 *skip = 1;
2895 goto out; 4135 goto out_unlock_session;
2896 } 4136 }
2897 4137
2898 ceph_msg_revoke_incoming(req->r_reply); 4138 ceph_msg_revoke_incoming(req->r_reply);
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2904 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 4144 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
2905 false); 4145 false);
2906 if (!m) 4146 if (!m)
2907 goto out; 4147 goto out_unlock_session;
2908 ceph_msg_put(req->r_reply); 4148 ceph_msg_put(req->r_reply);
2909 req->r_reply = m; 4149 req->r_reply = m;
2910 } 4150 }
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2915 req->r_reply->data_length); 4155 req->r_reply->data_length);
2916 m = NULL; 4156 m = NULL;
2917 *skip = 1; 4157 *skip = 1;
2918 goto out; 4158 goto out_unlock_session;
2919 } 4159 }
2920 4160
2921 m = ceph_msg_get(req->r_reply); 4161 m = ceph_msg_get(req->r_reply);
2922 dout("get_reply tid %lld %p\n", tid, m); 4162 dout("get_reply tid %lld %p\n", tid, m);
2923 4163
2924out: 4164out_unlock_session:
2925 mutex_unlock(&osdc->request_mutex); 4165 mutex_unlock(&osd->lock);
4166out_unlock_osdc:
4167 up_read(&osdc->lock);
4168 return m;
4169}
4170
4171/*
4172 * TODO: switch to a msg-owned pagelist
4173 */
4174static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
4175{
4176 struct ceph_msg *m;
4177 int type = le16_to_cpu(hdr->type);
4178 u32 front_len = le32_to_cpu(hdr->front_len);
4179 u32 data_len = le32_to_cpu(hdr->data_len);
4180
4181 m = ceph_msg_new(type, front_len, GFP_NOIO, false);
4182 if (!m)
4183 return NULL;
4184
4185 if (data_len) {
4186 struct page **pages;
4187 struct ceph_osd_data osd_data;
4188
4189 pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
4190 GFP_NOIO);
4191 if (IS_ERR(pages)) { /* ceph_alloc_page_vector() returns ERR_PTR, never NULL */
4192 ceph_msg_put(m);
4193 return NULL;
4194 }
4195
4196 ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
4197 false);
4198 ceph_osdc_msg_data_add(m, &osd_data);
4199 }
4200
2926 return m; 4201 return m;
2927} 4202}
2928 4203
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
2932{ 4207{
2933 struct ceph_osd *osd = con->private; 4208 struct ceph_osd *osd = con->private;
2934 int type = le16_to_cpu(hdr->type); 4209 int type = le16_to_cpu(hdr->type);
2935 int front = le32_to_cpu(hdr->front_len);
2936 4210
2937 *skip = 0; 4211 *skip = 0;
2938 switch (type) { 4212 switch (type) {
2939 case CEPH_MSG_OSD_MAP: 4213 case CEPH_MSG_OSD_MAP:
2940 case CEPH_MSG_WATCH_NOTIFY: 4214 case CEPH_MSG_WATCH_NOTIFY:
2941 return ceph_msg_new(type, front, GFP_NOFS, false); 4215 return alloc_msg_with_page_vector(hdr);
2942 case CEPH_MSG_OSD_OPREPLY: 4216 case CEPH_MSG_OSD_OPREPLY:
2943 return get_reply(con, hdr, skip); 4217 return get_reply(con, hdr, skip);
2944 default: 4218 default:
2945 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, 4219 pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__,
2946 osd->o_osd); 4220 osd->o_osd, type);
2947 *skip = 1; 4221 *skip = 1;
2948 return NULL; 4222 return NULL;
2949 } 4223 }
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = {
3047 .alloc_msg = alloc_msg, 4321 .alloc_msg = alloc_msg,
3048 .sign_message = osd_sign_message, 4322 .sign_message = osd_sign_message,
3049 .check_message_signature = osd_check_message_signature, 4323 .check_message_signature = osd_check_message_signature,
3050 .fault = osd_reset, 4324 .fault = osd_fault,
3051}; 4325};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c8cf33..cde52e94732f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -380,23 +380,24 @@ bad:
380 return ERR_PTR(err); 380 return ERR_PTR(err);
381} 381}
382 382
383/* 383int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
384 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
385 * to a set of osds) and primary_temp (explicit primary setting)
386 */
387static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
388{ 384{
389 if (l.pool < r.pool) 385 if (lhs->pool < rhs->pool)
390 return -1; 386 return -1;
391 if (l.pool > r.pool) 387 if (lhs->pool > rhs->pool)
392 return 1; 388 return 1;
393 if (l.seed < r.seed) 389 if (lhs->seed < rhs->seed)
394 return -1; 390 return -1;
395 if (l.seed > r.seed) 391 if (lhs->seed > rhs->seed)
396 return 1; 392 return 1;
393
397 return 0; 394 return 0;
398} 395}
399 396
397/*
398 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
399 * to a set of osds) and primary_temp (explicit primary setting)
400 */
400static int __insert_pg_mapping(struct ceph_pg_mapping *new, 401static int __insert_pg_mapping(struct ceph_pg_mapping *new,
401 struct rb_root *root) 402 struct rb_root *root)
402{ 403{
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new,
409 while (*p) { 410 while (*p) {
410 parent = *p; 411 parent = *p;
411 pg = rb_entry(parent, struct ceph_pg_mapping, node); 412 pg = rb_entry(parent, struct ceph_pg_mapping, node);
412 c = pgid_cmp(new->pgid, pg->pgid); 413 c = ceph_pg_compare(&new->pgid, &pg->pgid);
413 if (c < 0) 414 if (c < 0)
414 p = &(*p)->rb_left; 415 p = &(*p)->rb_left;
415 else if (c > 0) 416 else if (c > 0)
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
432 433
433 while (n) { 434 while (n) {
434 pg = rb_entry(n, struct ceph_pg_mapping, node); 435 pg = rb_entry(n, struct ceph_pg_mapping, node);
435 c = pgid_cmp(pgid, pg->pgid); 436 c = ceph_pg_compare(&pgid, &pg->pgid);
436 if (c < 0) { 437 if (c < 0) {
437 n = n->rb_left; 438 n = n->rb_left;
438 } else if (c > 0) { 439 } else if (c > 0) {
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
596 *p += 4; /* skip crash_replay_interval */ 597 *p += 4; /* skip crash_replay_interval */
597 598
598 if (ev >= 7) 599 if (ev >= 7)
599 *p += 1; /* skip min_size */ 600 pi->min_size = ceph_decode_8(p);
601 else
602 pi->min_size = pi->size - pi->size / 2;
600 603
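
For pools encoded before v7 the fallback appears to follow the same default userspace uses: with integer division, min_size = size - size/2, so a size-3 replicated pool gets min_size 2, size 2 gives 1, and size 1 gives 1.
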
601 if (ev >= 8) 604 if (ev >= 8)
602 *p += 8 + 8; /* skip quota_max_* */ 605 *p += 8 + 8; /* skip quota_max_* */
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
616 pi->write_tier = -1; 619 pi->write_tier = -1;
617 } 620 }
618 621
622 if (ev >= 10) {
623 /* skip properties */
624 num = ceph_decode_32(p);
625 while (num--) {
626 len = ceph_decode_32(p);
627 *p += len; /* key */
628 len = ceph_decode_32(p);
629 *p += len; /* val */
630 }
631 }
632
633 if (ev >= 11) {
634 /* skip hit_set_params */
635 *p += 1 + 1; /* versions */
636 len = ceph_decode_32(p);
637 *p += len;
638
639 *p += 4; /* skip hit_set_period */
640 *p += 4; /* skip hit_set_count */
641 }
642
643 if (ev >= 12)
644 *p += 4; /* skip stripe_width */
645
646 if (ev >= 13) {
647 *p += 8; /* skip target_max_bytes */
648 *p += 8; /* skip target_max_objects */
649 *p += 4; /* skip cache_target_dirty_ratio_micro */
650 *p += 4; /* skip cache_target_full_ratio_micro */
651 *p += 4; /* skip cache_min_flush_age */
652 *p += 4; /* skip cache_min_evict_age */
653 }
654
655 if (ev >= 14) {
656 /* skip erasure_code_profile */
657 len = ceph_decode_32(p);
658 *p += len;
659 }
660
661 if (ev >= 15)
662 pi->last_force_request_resend = ceph_decode_32(p);
663 else
664 pi->last_force_request_resend = 0;
665
619 /* ignore the rest */ 666 /* ignore the rest */
620 667
621 *p = pool_end; 668 *p = pool_end;
@@ -660,6 +707,23 @@ bad:
660/* 707/*
661 * osd map 708 * osd map
662 */ 709 */
710struct ceph_osdmap *ceph_osdmap_alloc(void)
711{
712 struct ceph_osdmap *map;
713
714 map = kzalloc(sizeof(*map), GFP_NOIO);
715 if (!map)
716 return NULL;
717
718 map->pg_pools = RB_ROOT;
719 map->pool_max = -1;
720 map->pg_temp = RB_ROOT;
721 map->primary_temp = RB_ROOT;
722 mutex_init(&map->crush_scratch_mutex);
723
724 return map;
725}
726
663void ceph_osdmap_destroy(struct ceph_osdmap *map) 727void ceph_osdmap_destroy(struct ceph_osdmap *map)
664{ 728{
665 dout("osdmap_destroy %p\n", map); 729 dout("osdmap_destroy %p\n", map);
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1183 struct ceph_osdmap *map; 1247 struct ceph_osdmap *map;
1184 int ret; 1248 int ret;
1185 1249
1186 map = kzalloc(sizeof(*map), GFP_NOFS); 1250 map = ceph_osdmap_alloc();
1187 if (!map) 1251 if (!map)
1188 return ERR_PTR(-ENOMEM); 1252 return ERR_PTR(-ENOMEM);
1189 1253
1190 map->pg_temp = RB_ROOT;
1191 map->primary_temp = RB_ROOT;
1192 mutex_init(&map->crush_scratch_mutex);
1193
1194 ret = osdmap_decode(p, end, map); 1254 ret = osdmap_decode(p, end, map);
1195 if (ret) { 1255 if (ret) {
1196 ceph_osdmap_destroy(map); 1256 ceph_osdmap_destroy(map);
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1204 * decode and apply an incremental map update. 1264 * decode and apply an incremental map update.
1205 */ 1265 */
1206struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1266struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1207 struct ceph_osdmap *map, 1267 struct ceph_osdmap *map)
1208 struct ceph_messenger *msgr)
1209{ 1268{
1210 struct crush_map *newcrush = NULL; 1269 struct crush_map *newcrush = NULL;
1211 struct ceph_fsid fsid; 1270 struct ceph_fsid fsid;
@@ -1381,8 +1440,252 @@ bad:
1381 return ERR_PTR(err); 1440 return ERR_PTR(err);
1382} 1441}
1383 1442
1443void ceph_oid_copy(struct ceph_object_id *dest,
1444 const struct ceph_object_id *src)
1445{
1446 WARN_ON(!ceph_oid_empty(dest));
1447
1448 if (src->name != src->inline_name) {
1449 /* very rare, see ceph_object_id definition */
1450 dest->name = kmalloc(src->name_len + 1,
1451 GFP_NOIO | __GFP_NOFAIL);
1452 }
1453
1454 memcpy(dest->name, src->name, src->name_len + 1);
1455 dest->name_len = src->name_len;
1456}
1457EXPORT_SYMBOL(ceph_oid_copy);
1458
1459static __printf(2, 0)
1460int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap)
1461{
1462 int len;
1463
1464 WARN_ON(!ceph_oid_empty(oid));
1465
1466 len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap);
1467 if (len >= sizeof(oid->inline_name))
1468 return len;
1469
1470 oid->name_len = len;
1471 return 0;
1472}
1473
1474/*
1475 * If oid doesn't fit into inline buffer, BUG.
1476 */
1477void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...)
1478{
1479 va_list ap;
1480
1481 va_start(ap, fmt);
1482 BUG_ON(oid_printf_vargs(oid, fmt, ap));
1483 va_end(ap);
1484}
1485EXPORT_SYMBOL(ceph_oid_printf);
1486
1487static __printf(3, 0)
1488int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp,
1489 const char *fmt, va_list ap)
1490{
1491 va_list aq;
1492 int len;
1493
1494 va_copy(aq, ap);
1495 len = oid_printf_vargs(oid, fmt, aq);
1496 va_end(aq);
1497
1498 if (len) {
1499 char *external_name;
1500
1501 external_name = kmalloc(len + 1, gfp);
1502 if (!external_name)
1503 return -ENOMEM;
1504
1505 oid->name = external_name;
1506 WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len);
1507 oid->name_len = len;
1508 }
1509
1510 return 0;
1511}
1512
1513/*
1514 * If oid doesn't fit into inline buffer, allocate.
1515 */
1516int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
1517 const char *fmt, ...)
1518{
1519 va_list ap;
1520 int ret;
1521
1522 va_start(ap, fmt);
1523 ret = oid_aprintf_vargs(oid, gfp, fmt, ap);
1524 va_end(ap);
1525
1526 return ret;
1527}
1528EXPORT_SYMBOL(ceph_oid_aprintf);
1529
1530void ceph_oid_destroy(struct ceph_object_id *oid)
1531{
1532 if (oid->name != oid->inline_name)
1533 kfree(oid->name);
1534}
1535EXPORT_SYMBOL(ceph_oid_destroy);
1536
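
As a usage sketch: most object names fit the inline buffer, so ceph_oid_printf() suffices, while ceph_oid_aprintf() is the fallible variant for names that may need an external allocation (it returns -ENOMEM if that allocation fails). The format string below mirrors how file data objects are named in osd_client.c; ceph_oid_init() and the surrounding names are assumptions from the same series, not shown in this hunk:

	/* Sketch; ino/objnum stand in for real identifiers. */
	static void name_file_object(struct ceph_object_id *oid,
				     u64 ino, u64 objnum)
	{
		ceph_oid_init(oid);	/* points name at the inline buffer */
		ceph_oid_printf(oid, "%llx.%08llx", ino, objnum);
	}

Pair every oid with ceph_oid_destroy(), which only kfrees when the name was externally allocated.
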
1537/*
1538 * osds only
1539 */
1540static bool __osds_equal(const struct ceph_osds *lhs,
1541 const struct ceph_osds *rhs)
1542{
1543 if (lhs->size == rhs->size &&
1544 !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0])))
1545 return true;
1546
1547 return false;
1548}
1549
1550/*
1551 * osds + primary
1552 */
1553static bool osds_equal(const struct ceph_osds *lhs,
1554 const struct ceph_osds *rhs)
1555{
1556 if (__osds_equal(lhs, rhs) &&
1557 lhs->primary == rhs->primary)
1558 return true;
1559
1560 return false;
1561}
1562
1563static bool osds_valid(const struct ceph_osds *set)
1564{
1565 /* non-empty set */
1566 if (set->size > 0 && set->primary >= 0)
1567 return true;
1568
1569 /* empty can_shift_osds set */
1570 if (!set->size && set->primary == -1)
1571 return true;
1572
1573 /* empty !can_shift_osds set - all NONE */
1574 if (set->size > 0 && set->primary == -1) {
1575 int i;
1576
1577 for (i = 0; i < set->size; i++) {
1578 if (set->osds[i] != CRUSH_ITEM_NONE)
1579 break;
1580 }
1581 if (i == set->size)
1582 return true;
1583 }
1584
1585 return false;
1586}
1587
1588void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
1589{
1590 memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0]));
1591 dest->size = src->size;
1592 dest->primary = src->primary;
1593}
1594
1595static bool is_split(const struct ceph_pg *pgid,
1596 u32 old_pg_num,
1597 u32 new_pg_num)
1598{
1599 int old_bits = calc_bits_of(old_pg_num);
1600 int old_mask = (1 << old_bits) - 1;
1601 int n;
1602
1603 WARN_ON(pgid->seed >= old_pg_num);
1604 if (new_pg_num <= old_pg_num)
1605 return false;
1606
1607 for (n = 1; ; n++) {
1608 int next_bit = n << (old_bits - 1);
1609 u32 s = next_bit | pgid->seed;
1610
1611 if (s < old_pg_num || s == pgid->seed)
1612 continue;
1613 if (s >= new_pg_num)
1614 break;
1615
1616 s = ceph_stable_mod(s, old_pg_num, old_mask);
1617 if (s == pgid->seed)
1618 return true;
1619 }
1620
1621 return false;
1622}
1623
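
A worked example for is_split(): growing a pool from old_pg_num = 4 (old_bits = 3, old_mask = 7) to new_pg_num = 8 and asking about seed 1. At n = 1 the candidate is s = 4 | 1 = 5, which is >= 4 and < 8, and ceph_stable_mod(5, 4, 7) folds back to 1, so PG x.1 is split (its new child is x.5) and ceph_is_new_interval() below treats that as an interval change.
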
1624bool ceph_is_new_interval(const struct ceph_osds *old_acting,
1625 const struct ceph_osds *new_acting,
1626 const struct ceph_osds *old_up,
1627 const struct ceph_osds *new_up,
1628 int old_size,
1629 int new_size,
1630 int old_min_size,
1631 int new_min_size,
1632 u32 old_pg_num,
1633 u32 new_pg_num,
1634 bool old_sort_bitwise,
1635 bool new_sort_bitwise,
1636 const struct ceph_pg *pgid)
1637{
1638 return !osds_equal(old_acting, new_acting) ||
1639 !osds_equal(old_up, new_up) ||
1640 old_size != new_size ||
1641 old_min_size != new_min_size ||
1642 is_split(pgid, old_pg_num, new_pg_num) ||
1643 old_sort_bitwise != new_sort_bitwise;
1644}
1645
1646static int calc_pg_rank(int osd, const struct ceph_osds *acting)
1647{
1648 int i;
1649
1650 for (i = 0; i < acting->size; i++) {
1651 if (acting->osds[i] == osd)
1652 return i;
1653 }
1654
1655 return -1;
1656}
1657
1658static bool primary_changed(const struct ceph_osds *old_acting,
1659 const struct ceph_osds *new_acting)
1660{
1661 if (!old_acting->size && !new_acting->size)
1662 return false; /* both still empty */
1384 1663
1664 if (!old_acting->size ^ !new_acting->size)
1665 return true; /* was empty, now not, or vice versa */
1385 1666
1667 if (old_acting->primary != new_acting->primary)
1668 return true; /* primary changed */
1669
1670 if (calc_pg_rank(old_acting->primary, old_acting) !=
1671 calc_pg_rank(new_acting->primary, new_acting))
1672 return true;
1673
1674 return false; /* same primary (tho replicas may have changed) */
1675}
1676
1677bool ceph_osds_changed(const struct ceph_osds *old_acting,
1678 const struct ceph_osds *new_acting,
1679 bool any_change)
1680{
1681 if (primary_changed(old_acting, new_acting))
1682 return true;
1683
1684 if (any_change && !__osds_equal(old_acting, new_acting))
1685 return true;
1686
1687 return false;
1688}
1386 1689
1387/* 1690/*
1388 * calculate file layout from given offset, length. 1691 * calculate file layout from given offset, length.
@@ -1455,30 +1758,71 @@ invalid:
1455EXPORT_SYMBOL(ceph_calc_file_object_mapping); 1758EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1456 1759
1457/* 1760/*
1458 * Calculate mapping of a (oloc, oid) pair to a PG. Should only be 1761 * Map an object into a PG.
1459 * called with target's (oloc, oid), since tiering isn't taken into 1762 *
1460 * account. 1763 * Should only be called with target_oid and target_oloc (as opposed to
1764 * base_oid and base_oloc), since tiering isn't taken into account.
1461 */ 1765 */
1462int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, 1766int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
1463 struct ceph_object_locator *oloc, 1767 struct ceph_object_id *oid,
1464 struct ceph_object_id *oid, 1768 struct ceph_object_locator *oloc,
1465 struct ceph_pg *pg_out) 1769 struct ceph_pg *raw_pgid)
1466{ 1770{
1467 struct ceph_pg_pool_info *pi; 1771 struct ceph_pg_pool_info *pi;
1468 1772
1469 pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); 1773 pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
1470 if (!pi) 1774 if (!pi)
1471 return -EIO; 1775 return -ENOENT;
1472 1776
1473 pg_out->pool = oloc->pool; 1777 raw_pgid->pool = oloc->pool;
1474 pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, 1778 raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
1475 oid->name_len); 1779 oid->name_len);
1476 1780
1477 dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, 1781 dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len,
1478 pg_out->pool, pg_out->seed); 1782 oid->name, raw_pgid->pool, raw_pgid->seed);
1479 return 0; 1783 return 0;
1480} 1784}
1481EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1785EXPORT_SYMBOL(ceph_object_locator_to_pg);
1786
1787/*
1788 * Map a raw PG (full precision ps) into an actual PG.
1789 */
1790static void raw_pg_to_pg(struct ceph_pg_pool_info *pi,
1791 const struct ceph_pg *raw_pgid,
1792 struct ceph_pg *pgid)
1793{
1794 pgid->pool = raw_pgid->pool;
1795 pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num,
1796 pi->pg_num_mask);
1797}
1798
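
For example, assuming the usual mask rule, pg_num = 12 gives pg_num_mask = 15 (next power of two, minus one). A raw seed of 5 is below pg_num and maps to itself, while seed 13 has 13 & 15 = 13 >= 12, so ceph_stable_mod() drops a bit and yields 13 & 7 = 5: seeds past the end of the pool fold back onto the lower half, which is what keeps PG membership stable as pg_num grows through non-power-of-two values.
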
1799/*
1800 * Map a raw PG (full precision ps) into a placement ps (placement
1801 * seed). Include pool id in that value so that different pools don't
1802 * use the same seeds.
1803 */
1804static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
1805 const struct ceph_pg *raw_pgid)
1806{
1807 if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1808 /* hash pool id and seed so that pool PGs do not overlap */
1809 return crush_hash32_2(CRUSH_HASH_RJENKINS1,
1810 ceph_stable_mod(raw_pgid->seed,
1811 pi->pgp_num,
1812 pi->pgp_num_mask),
1813 raw_pgid->pool);
1814 } else {
1815 /*
1816 * legacy behavior: add ps and pool together. this is
1817 * not a great approach because the PGs from each pool
1818 * will overlap on top of each other: 0.5 == 1.4 ==
1819 * 2.3 == ...
1820 */
1821 return ceph_stable_mod(raw_pgid->seed, pi->pgp_num,
1822 pi->pgp_num_mask) +
1823 (unsigned)raw_pgid->pool;
1824 }
1825}
1482 1826
1483static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 1827static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1484 int *result, int result_max, 1828 int *result, int result_max,
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1497} 1841}
1498 1842
1499/* 1843/*
1500 * Calculate raw (crush) set for given pgid. 1844 * Calculate raw set (CRUSH output) for given PG. The result may
1845 * contain nonexistent OSDs. ->primary is undefined for a raw set.
1501 * 1846 *
1502 * Return raw set length, or error. 1847 * Placement seed (CRUSH input) is returned through @ppps.
1503 */ 1848 */
1504static int pg_to_raw_osds(struct ceph_osdmap *osdmap, 1849static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
1505 struct ceph_pg_pool_info *pool, 1850 struct ceph_pg_pool_info *pi,
1506 struct ceph_pg pgid, u32 pps, int *osds) 1851 const struct ceph_pg *raw_pgid,
1852 struct ceph_osds *raw,
1853 u32 *ppps)
1507{ 1854{
1855 u32 pps = raw_pg_to_pps(pi, raw_pgid);
1508 int ruleno; 1856 int ruleno;
1509 int len; 1857 int len;
1510 1858
1511 /* crush */ 1859 ceph_osds_init(raw);
1512 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1860 if (ppps)
1513 pool->type, pool->size); 1861 *ppps = pps;
1862
1863 ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type,
1864 pi->size);
1514 if (ruleno < 0) { 1865 if (ruleno < 0) {
1515 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", 1866 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1516 pgid.pool, pool->crush_ruleset, pool->type, 1867 pi->id, pi->crush_ruleset, pi->type, pi->size);
1517 pool->size); 1868 return;
1518 return -ENOENT;
1519 } 1869 }
1520 1870
1521 len = do_crush(osdmap, ruleno, pps, osds, 1871 len = do_crush(osdmap, ruleno, pps, raw->osds,
1522 min_t(int, pool->size, CEPH_PG_MAX_SIZE), 1872 min_t(int, pi->size, ARRAY_SIZE(raw->osds)),
1523 osdmap->osd_weight, osdmap->max_osd); 1873 osdmap->osd_weight, osdmap->max_osd);
1524 if (len < 0) { 1874 if (len < 0) {
1525 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 1875 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1526 len, ruleno, pgid.pool, pool->crush_ruleset, 1876 len, ruleno, pi->id, pi->crush_ruleset, pi->type,
1527 pool->type, pool->size); 1877 pi->size);
1528 return len; 1878 return;
1529 } 1879 }
1530 1880
1531 return len; 1881 raw->size = len;
1532} 1882}
1533 1883
1534/* 1884/*
1535 * Given raw set, calculate up set and up primary. 1885 * Given raw set, calculate up set and up primary. By definition of an
1886 * up set, the result won't contain nonexistent or down OSDs.
1536 * 1887 *
1537 * Return up set length. *primary is set to up primary osd id, or -1 1888 * This is done in-place - on return @set is the up set. If it's
1538 * if up set is empty. 1889 * empty, ->primary will remain undefined.
1539 */ 1890 */
1540static int raw_to_up_osds(struct ceph_osdmap *osdmap, 1891static void raw_to_up_osds(struct ceph_osdmap *osdmap,
1541 struct ceph_pg_pool_info *pool, 1892 struct ceph_pg_pool_info *pi,
1542 int *osds, int len, int *primary) 1893 struct ceph_osds *set)
1543{ 1894{
1544 int up_primary = -1;
1545 int i; 1895 int i;
1546 1896
1547 if (ceph_can_shift_osds(pool)) { 1897 /* ->primary is undefined for a raw set */
1898 BUG_ON(set->primary != -1);
1899
1900 if (ceph_can_shift_osds(pi)) {
1548 int removed = 0; 1901 int removed = 0;
1549 1902
1550 for (i = 0; i < len; i++) { 1903 /* shift left */
1551 if (ceph_osd_is_down(osdmap, osds[i])) { 1904 for (i = 0; i < set->size; i++) {
1905 if (ceph_osd_is_down(osdmap, set->osds[i])) {
1552 removed++; 1906 removed++;
1553 continue; 1907 continue;
1554 } 1908 }
1555 if (removed) 1909 if (removed)
1556 osds[i - removed] = osds[i]; 1910 set->osds[i - removed] = set->osds[i];
1557 } 1911 }
1558 1912 set->size -= removed;
1559 len -= removed; 1913 if (set->size > 0)
1560 if (len > 0) 1914 set->primary = set->osds[0];
1561 up_primary = osds[0];
1562 } else { 1915 } else {
1563 for (i = len - 1; i >= 0; i--) { 1916 /* set down/dne devices to NONE */
1564 if (ceph_osd_is_down(osdmap, osds[i])) 1917 for (i = set->size - 1; i >= 0; i--) {
1565 osds[i] = CRUSH_ITEM_NONE; 1918 if (ceph_osd_is_down(osdmap, set->osds[i]))
1919 set->osds[i] = CRUSH_ITEM_NONE;
1566 else 1920 else
1567 up_primary = osds[i]; 1921 set->primary = set->osds[i];
1568 } 1922 }
1569 } 1923 }
1570
1571 *primary = up_primary;
1572 return len;
1573} 1924}
1574 1925
1575static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, 1926static void apply_primary_affinity(struct ceph_osdmap *osdmap,
1576 struct ceph_pg_pool_info *pool, 1927 struct ceph_pg_pool_info *pi,
1577 int *osds, int len, int *primary) 1928 u32 pps,
1929 struct ceph_osds *up)
1578{ 1930{
1579 int i; 1931 int i;
1580 int pos = -1; 1932 int pos = -1;
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1586 if (!osdmap->osd_primary_affinity) 1938 if (!osdmap->osd_primary_affinity)
1587 return; 1939 return;
1588 1940
1589 for (i = 0; i < len; i++) { 1941 for (i = 0; i < up->size; i++) {
1590 int osd = osds[i]; 1942 int osd = up->osds[i];
1591 1943
1592 if (osd != CRUSH_ITEM_NONE && 1944 if (osd != CRUSH_ITEM_NONE &&
1593 osdmap->osd_primary_affinity[osd] != 1945 osdmap->osd_primary_affinity[osd] !=
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1595 break; 1947 break;
1596 } 1948 }
1597 } 1949 }
1598 if (i == len) 1950 if (i == up->size)
1599 return; 1951 return;
1600 1952
1601 /* 1953 /*
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1603 * osd into the hash/rng so that a proportional fraction of an 1955 * osd into the hash/rng so that a proportional fraction of an
1604 * osd's pgs get rejected as primary. 1956 * osd's pgs get rejected as primary.
1605 */ 1957 */
1606 for (i = 0; i < len; i++) { 1958 for (i = 0; i < up->size; i++) {
1607 int osd = osds[i]; 1959 int osd = up->osds[i];
1608 u32 aff; 1960 u32 aff;
1609 1961
1610 if (osd == CRUSH_ITEM_NONE) 1962 if (osd == CRUSH_ITEM_NONE)
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
 	if (pos < 0)
 		return;
 
-	*primary = osds[pos];
+	up->primary = up->osds[pos];
 
-	if (ceph_can_shift_osds(pool) && pos > 0) {
+	if (ceph_can_shift_osds(pi) && pos > 0) {
 		/* move the new primary to the front */
 		for (i = pos; i > 0; i--)
-			osds[i] = osds[i - 1];
-		osds[0] = *primary;
+			up->osds[i] = up->osds[i - 1];
+		up->osds[0] = up->primary;
 	}
 }
 
 /*
- * Given up set, apply pg_temp and primary_temp mappings.
+ * Get pg_temp and primary_temp mappings for given PG.
  *
- * Return acting set length. *primary is set to acting primary osd id,
- * or -1 if acting set is empty.
+ * Note that a PG may have none, only pg_temp, only primary_temp or
+ * both pg_temp and primary_temp mappings. This means @temp isn't
+ * always a valid OSD set on return: in the "only primary_temp" case,
+ * @temp will have its ->primary >= 0 but ->size == 0.
  */
-static int apply_temps(struct ceph_osdmap *osdmap,
-		       struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
-		       int *osds, int len, int *primary)
+static void get_temp_osds(struct ceph_osdmap *osdmap,
+			  struct ceph_pg_pool_info *pi,
+			  const struct ceph_pg *raw_pgid,
+			  struct ceph_osds *temp)
 {
+	struct ceph_pg pgid;
 	struct ceph_pg_mapping *pg;
-	int temp_len;
-	int temp_primary;
 	int i;
 
-	/* raw_pg -> pg */
-	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
-				    pool->pg_num_mask);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
+	ceph_osds_init(temp);
 
 	/* pg_temp? */
 	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
-		temp_len = 0;
-		temp_primary = -1;
-
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
-				if (ceph_can_shift_osds(pool))
+				if (ceph_can_shift_osds(pi))
 					continue;
-				else
-					osds[temp_len++] = CRUSH_ITEM_NONE;
+
+				temp->osds[temp->size++] = CRUSH_ITEM_NONE;
 			} else {
-				osds[temp_len++] = pg->pg_temp.osds[i];
+				temp->osds[temp->size++] = pg->pg_temp.osds[i];
 			}
 		}
 
 		/* apply pg_temp's primary */
-		for (i = 0; i < temp_len; i++) {
-			if (osds[i] != CRUSH_ITEM_NONE) {
-				temp_primary = osds[i];
+		for (i = 0; i < temp->size; i++) {
+			if (temp->osds[i] != CRUSH_ITEM_NONE) {
+				temp->primary = temp->osds[i];
 				break;
 			}
 		}
-	} else {
-		temp_len = len;
-		temp_primary = *primary;
 	}
 
 	/* primary_temp? */
 	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
 	if (pg)
-		temp_primary = pg->primary_temp.osd;
-
-	*primary = temp_primary;
-	return temp_len;
+		temp->primary = pg->primary_temp.osd;
 }
 
 /*
- * Calculate acting set for given pgid.
+ * Map a PG to its acting set as well as its up set.
  *
- * Return acting set length, or error. *primary is set to acting
- * primary osd id, or -1 if acting set is empty or on error.
+ * Acting set is used for data mapping purposes, while up set can be
+ * recorded for detecting interval changes and deciding whether to
+ * resend a request.
  */
-int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
-			int *osds, int *primary)
+void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       const struct ceph_pg *raw_pgid,
+			       struct ceph_osds *up,
+			       struct ceph_osds *acting)
 {
-	struct ceph_pg_pool_info *pool;
+	struct ceph_pg_pool_info *pi;
 	u32 pps;
-	int len;
 
-	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
-	if (!pool) {
-		*primary = -1;
-		return -ENOENT;
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi) {
+		ceph_osds_init(up);
+		ceph_osds_init(acting);
+		goto out;
 	}
 
-	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
-		/* hash pool id and seed so that pool PGs do not overlap */
-		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
-				     ceph_stable_mod(pgid.seed, pool->pgp_num,
-						     pool->pgp_num_mask),
-				     pgid.pool);
-	} else {
-		/*
-		 * legacy behavior: add ps and pool together. this is
-		 * not a great approach because the PGs from each pool
-		 * will overlap on top of each other: 0.5 == 1.4 ==
-		 * 2.3 == ...
-		 */
-		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
-				      pool->pgp_num_mask) +
-			(unsigned)pgid.pool;
-	}
-
-	len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
-	if (len < 0) {
-		*primary = -1;
-		return len;
+	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+	raw_to_up_osds(osdmap, pi, up);
+	apply_primary_affinity(osdmap, pi, pps, up);
+	get_temp_osds(osdmap, pi, raw_pgid, acting);
+	if (!acting->size) {
+		memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
+		acting->size = up->size;
+		if (acting->primary == -1)
+			acting->primary = up->primary;
 	}
-
-	len = raw_to_up_osds(osdmap, pool, osds, len, primary);
-
-	apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
-
-	len = apply_temps(osdmap, pool, pgid, osds, len, primary);
-
-	return len;
+out:
+	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
 /*
- * Return primary osd for given pgid, or -1 if none.
+ * Return acting primary for given PG, or -1 if none.
  */
-int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
+			      const struct ceph_pg *raw_pgid)
 {
-	int osds[CEPH_PG_MAX_SIZE];
-	int primary;
-
-	ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
+	struct ceph_osds up, acting;
 
-	return primary;
+	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+	return acting.primary;
 }
-EXPORT_SYMBOL(ceph_calc_pg_primary);
+EXPORT_SYMBOL(ceph_pg_to_acting_primary);
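Taken together, the new entry points replace the int-array API with ceph_osds pairs. A minimal caller sketch using only the functions shown above, assuming a populated osdmap and a raw (pre-stable_mod) PG id, with error handling and locking elided:

	struct ceph_osds up, acting;

	/*
	 * up = CRUSH result filtered by osd state and primary affinity;
	 * acting = up with pg_temp/primary_temp applied, i.e. the set
	 * that I/O should actually target.
	 */
	ceph_pg_to_up_acting_osds(osdmap, &raw_pgid, &up, &acting);
	if (acting.primary >= 0)
		pr_info("pg %llu.%x -> primary osd%d (%d osds)\n",
			raw_pgid.pool, raw_pgid.seed,
			acting.primary, acting.size);

Because the up set is returned alongside the acting set in one call, a client can record both and later detect interval changes (and decide whether to resend a request) without recomputing the CRUSH mapping.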