diff options
-rw-r--r-- | drivers/block/rbd.c | 305 | ||||
-rw-r--r-- | fs/ceph/addr.c | 214 | ||||
-rw-r--r-- | fs/ceph/cache.c | 2 | ||||
-rw-r--r-- | fs/ceph/caps.c | 51 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 2 | ||||
-rw-r--r-- | fs/ceph/dir.c | 376 | ||||
-rw-r--r-- | fs/ceph/file.c | 89 | ||||
-rw-r--r-- | fs/ceph/inode.c | 159 | ||||
-rw-r--r-- | fs/ceph/ioctl.c | 14 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 140 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 17 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 43 | ||||
-rw-r--r-- | fs/ceph/super.c | 47 | ||||
-rw-r--r-- | fs/ceph/super.h | 12 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 25 | ||||
-rw-r--r-- | include/linux/ceph/ceph_frag.h | 4 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 20 | ||||
-rw-r--r-- | include/linux/ceph/decode.h | 2 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 57 | ||||
-rw-r--r-- | include/linux/ceph/mon_client.h | 23 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 231 | ||||
-rw-r--r-- | include/linux/ceph/osdmap.h | 158 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 34 | ||||
-rw-r--r-- | net/ceph/ceph_common.c | 2 | ||||
-rw-r--r-- | net/ceph/ceph_strings.c | 16 | ||||
-rw-r--r-- | net/ceph/debugfs.c | 147 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 393 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 4032 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 651 |
29 files changed, 4758 insertions, 2508 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0ede6d7e2568..81666a56415e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -350,12 +350,12 @@ struct rbd_device { | |||
350 | struct rbd_spec *spec; | 350 | struct rbd_spec *spec; |
351 | struct rbd_options *opts; | 351 | struct rbd_options *opts; |
352 | 352 | ||
353 | char *header_name; | 353 | struct ceph_object_id header_oid; |
354 | struct ceph_object_locator header_oloc; | ||
354 | 355 | ||
355 | struct ceph_file_layout layout; | 356 | struct ceph_file_layout layout; |
356 | 357 | ||
357 | struct ceph_osd_event *watch_event; | 358 | struct ceph_osd_linger_request *watch_handle; |
358 | struct rbd_obj_request *watch_request; | ||
359 | 359 | ||
360 | struct rbd_spec *parent_spec; | 360 | struct rbd_spec *parent_spec; |
361 | u64 parent_overlap; | 361 | u64 parent_overlap; |
@@ -1596,12 +1596,6 @@ static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) | |||
1596 | return __rbd_obj_request_wait(obj_request, 0); | 1596 | return __rbd_obj_request_wait(obj_request, 0); |
1597 | } | 1597 | } |
1598 | 1598 | ||
1599 | static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, | ||
1600 | unsigned long timeout) | ||
1601 | { | ||
1602 | return __rbd_obj_request_wait(obj_request, timeout); | ||
1603 | } | ||
1604 | |||
1605 | static void rbd_img_request_complete(struct rbd_img_request *img_request) | 1599 | static void rbd_img_request_complete(struct rbd_img_request *img_request) |
1606 | { | 1600 | { |
1607 | 1601 | ||
@@ -1751,12 +1745,6 @@ static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) | |||
1751 | complete_all(&obj_request->completion); | 1745 | complete_all(&obj_request->completion); |
1752 | } | 1746 | } |
1753 | 1747 | ||
1754 | static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) | ||
1755 | { | ||
1756 | dout("%s: obj %p\n", __func__, obj_request); | ||
1757 | obj_request_done_set(obj_request); | ||
1758 | } | ||
1759 | |||
1760 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) | 1748 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) |
1761 | { | 1749 | { |
1762 | struct rbd_img_request *img_request = NULL; | 1750 | struct rbd_img_request *img_request = NULL; |
@@ -1828,13 +1816,12 @@ static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) | |||
1828 | obj_request_done_set(obj_request); | 1816 | obj_request_done_set(obj_request); |
1829 | } | 1817 | } |
1830 | 1818 | ||
1831 | static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, | 1819 | static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) |
1832 | struct ceph_msg *msg) | ||
1833 | { | 1820 | { |
1834 | struct rbd_obj_request *obj_request = osd_req->r_priv; | 1821 | struct rbd_obj_request *obj_request = osd_req->r_priv; |
1835 | u16 opcode; | 1822 | u16 opcode; |
1836 | 1823 | ||
1837 | dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); | 1824 | dout("%s: osd_req %p\n", __func__, osd_req); |
1838 | rbd_assert(osd_req == obj_request->osd_req); | 1825 | rbd_assert(osd_req == obj_request->osd_req); |
1839 | if (obj_request_img_data_test(obj_request)) { | 1826 | if (obj_request_img_data_test(obj_request)) { |
1840 | rbd_assert(obj_request->img_request); | 1827 | rbd_assert(obj_request->img_request); |
@@ -1878,10 +1865,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, | |||
1878 | case CEPH_OSD_OP_CALL: | 1865 | case CEPH_OSD_OP_CALL: |
1879 | rbd_osd_call_callback(obj_request); | 1866 | rbd_osd_call_callback(obj_request); |
1880 | break; | 1867 | break; |
1881 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
1882 | case CEPH_OSD_OP_WATCH: | ||
1883 | rbd_osd_trivial_callback(obj_request); | ||
1884 | break; | ||
1885 | default: | 1868 | default: |
1886 | rbd_warn(NULL, "%s: unsupported op %hu", | 1869 | rbd_warn(NULL, "%s: unsupported op %hu", |
1887 | obj_request->object_name, (unsigned short) opcode); | 1870 | obj_request->object_name, (unsigned short) opcode); |
@@ -1896,27 +1879,17 @@ static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) | |||
1896 | { | 1879 | { |
1897 | struct rbd_img_request *img_request = obj_request->img_request; | 1880 | struct rbd_img_request *img_request = obj_request->img_request; |
1898 | struct ceph_osd_request *osd_req = obj_request->osd_req; | 1881 | struct ceph_osd_request *osd_req = obj_request->osd_req; |
1899 | u64 snap_id; | ||
1900 | 1882 | ||
1901 | rbd_assert(osd_req != NULL); | 1883 | if (img_request) |
1902 | 1884 | osd_req->r_snapid = img_request->snap_id; | |
1903 | snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; | ||
1904 | ceph_osdc_build_request(osd_req, obj_request->offset, | ||
1905 | NULL, snap_id, NULL); | ||
1906 | } | 1885 | } |
1907 | 1886 | ||
1908 | static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) | 1887 | static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) |
1909 | { | 1888 | { |
1910 | struct rbd_img_request *img_request = obj_request->img_request; | ||
1911 | struct ceph_osd_request *osd_req = obj_request->osd_req; | 1889 | struct ceph_osd_request *osd_req = obj_request->osd_req; |
1912 | struct ceph_snap_context *snapc; | ||
1913 | struct timespec mtime = CURRENT_TIME; | ||
1914 | 1890 | ||
1915 | rbd_assert(osd_req != NULL); | 1891 | osd_req->r_mtime = CURRENT_TIME; |
1916 | 1892 | osd_req->r_data_offset = obj_request->offset; | |
1917 | snapc = img_request ? img_request->snapc : NULL; | ||
1918 | ceph_osdc_build_request(osd_req, obj_request->offset, | ||
1919 | snapc, CEPH_NOSNAP, &mtime); | ||
1920 | } | 1893 | } |
1921 | 1894 | ||
1922 | /* | 1895 | /* |
@@ -1954,7 +1927,7 @@ static struct ceph_osd_request *rbd_osd_req_create( | |||
1954 | osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, | 1927 | osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, |
1955 | GFP_NOIO); | 1928 | GFP_NOIO); |
1956 | if (!osd_req) | 1929 | if (!osd_req) |
1957 | return NULL; /* ENOMEM */ | 1930 | goto fail; |
1958 | 1931 | ||
1959 | if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) | 1932 | if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) |
1960 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | 1933 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; |
@@ -1965,9 +1938,18 @@ static struct ceph_osd_request *rbd_osd_req_create( | |||
1965 | osd_req->r_priv = obj_request; | 1938 | osd_req->r_priv = obj_request; |
1966 | 1939 | ||
1967 | osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); | 1940 | osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); |
1968 | ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); | 1941 | if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", |
1942 | obj_request->object_name)) | ||
1943 | goto fail; | ||
1944 | |||
1945 | if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) | ||
1946 | goto fail; | ||
1969 | 1947 | ||
1970 | return osd_req; | 1948 | return osd_req; |
1949 | |||
1950 | fail: | ||
1951 | ceph_osdc_put_request(osd_req); | ||
1952 | return NULL; | ||
1971 | } | 1953 | } |
1972 | 1954 | ||
1973 | /* | 1955 | /* |
@@ -2003,16 +1985,25 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) | |||
2003 | osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, | 1985 | osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, |
2004 | false, GFP_NOIO); | 1986 | false, GFP_NOIO); |
2005 | if (!osd_req) | 1987 | if (!osd_req) |
2006 | return NULL; /* ENOMEM */ | 1988 | goto fail; |
2007 | 1989 | ||
2008 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | 1990 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; |
2009 | osd_req->r_callback = rbd_osd_req_callback; | 1991 | osd_req->r_callback = rbd_osd_req_callback; |
2010 | osd_req->r_priv = obj_request; | 1992 | osd_req->r_priv = obj_request; |
2011 | 1993 | ||
2012 | osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); | 1994 | osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); |
2013 | ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); | 1995 | if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", |
1996 | obj_request->object_name)) | ||
1997 | goto fail; | ||
1998 | |||
1999 | if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) | ||
2000 | goto fail; | ||
2014 | 2001 | ||
2015 | return osd_req; | 2002 | return osd_req; |
2003 | |||
2004 | fail: | ||
2005 | ceph_osdc_put_request(osd_req); | ||
2006 | return NULL; | ||
2016 | } | 2007 | } |
2017 | 2008 | ||
2018 | 2009 | ||
@@ -2973,17 +2964,20 @@ static int rbd_img_request_submit(struct rbd_img_request *img_request) | |||
2973 | { | 2964 | { |
2974 | struct rbd_obj_request *obj_request; | 2965 | struct rbd_obj_request *obj_request; |
2975 | struct rbd_obj_request *next_obj_request; | 2966 | struct rbd_obj_request *next_obj_request; |
2967 | int ret = 0; | ||
2976 | 2968 | ||
2977 | dout("%s: img %p\n", __func__, img_request); | 2969 | dout("%s: img %p\n", __func__, img_request); |
2978 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) { | ||
2979 | int ret; | ||
2980 | 2970 | ||
2971 | rbd_img_request_get(img_request); | ||
2972 | for_each_obj_request_safe(img_request, obj_request, next_obj_request) { | ||
2981 | ret = rbd_img_obj_request_submit(obj_request); | 2973 | ret = rbd_img_obj_request_submit(obj_request); |
2982 | if (ret) | 2974 | if (ret) |
2983 | return ret; | 2975 | goto out_put_ireq; |
2984 | } | 2976 | } |
2985 | 2977 | ||
2986 | return 0; | 2978 | out_put_ireq: |
2979 | rbd_img_request_put(img_request); | ||
2980 | return ret; | ||
2987 | } | 2981 | } |
2988 | 2982 | ||
2989 | static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) | 2983 | static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) |
@@ -3090,45 +3084,18 @@ out_err: | |||
3090 | obj_request_done_set(obj_request); | 3084 | obj_request_done_set(obj_request); |
3091 | } | 3085 | } |
3092 | 3086 | ||
3093 | static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) | 3087 | static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev); |
3094 | { | 3088 | static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev); |
3095 | struct rbd_obj_request *obj_request; | ||
3096 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | ||
3097 | int ret; | ||
3098 | |||
3099 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, | ||
3100 | OBJ_REQUEST_NODATA); | ||
3101 | if (!obj_request) | ||
3102 | return -ENOMEM; | ||
3103 | |||
3104 | ret = -ENOMEM; | ||
3105 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, | ||
3106 | obj_request); | ||
3107 | if (!obj_request->osd_req) | ||
3108 | goto out; | ||
3109 | |||
3110 | osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, | ||
3111 | notify_id, 0, 0); | ||
3112 | rbd_osd_req_format_read(obj_request); | ||
3113 | 3089 | ||
3114 | ret = rbd_obj_request_submit(osdc, obj_request); | 3090 | static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, |
3115 | if (ret) | 3091 | u64 notifier_id, void *data, size_t data_len) |
3116 | goto out; | ||
3117 | ret = rbd_obj_request_wait(obj_request); | ||
3118 | out: | ||
3119 | rbd_obj_request_put(obj_request); | ||
3120 | |||
3121 | return ret; | ||
3122 | } | ||
3123 | |||
3124 | static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | ||
3125 | { | 3092 | { |
3126 | struct rbd_device *rbd_dev = (struct rbd_device *)data; | 3093 | struct rbd_device *rbd_dev = arg; |
3094 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | ||
3127 | int ret; | 3095 | int ret; |
3128 | 3096 | ||
3129 | dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, | 3097 | dout("%s rbd_dev %p cookie %llu notify_id %llu\n", __func__, rbd_dev, |
3130 | rbd_dev->header_name, (unsigned long long)notify_id, | 3098 | cookie, notify_id); |
3131 | (unsigned int)opcode); | ||
3132 | 3099 | ||
3133 | /* | 3100 | /* |
3134 | * Until adequate refresh error handling is in place, there is | 3101 | * Until adequate refresh error handling is in place, there is |
@@ -3140,63 +3107,31 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | |||
3140 | if (ret) | 3107 | if (ret) |
3141 | rbd_warn(rbd_dev, "refresh failed: %d", ret); | 3108 | rbd_warn(rbd_dev, "refresh failed: %d", ret); |
3142 | 3109 | ||
3143 | ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); | 3110 | ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, |
3111 | &rbd_dev->header_oloc, notify_id, cookie, | ||
3112 | NULL, 0); | ||
3144 | if (ret) | 3113 | if (ret) |
3145 | rbd_warn(rbd_dev, "notify_ack ret %d", ret); | 3114 | rbd_warn(rbd_dev, "notify_ack ret %d", ret); |
3146 | } | 3115 | } |
3147 | 3116 | ||
3148 | /* | 3117 | static void rbd_watch_errcb(void *arg, u64 cookie, int err) |
3149 | * Send a (un)watch request and wait for the ack. Return a request | ||
3150 | * with a ref held on success or error. | ||
3151 | */ | ||
3152 | static struct rbd_obj_request *rbd_obj_watch_request_helper( | ||
3153 | struct rbd_device *rbd_dev, | ||
3154 | bool watch) | ||
3155 | { | 3118 | { |
3156 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 3119 | struct rbd_device *rbd_dev = arg; |
3157 | struct ceph_options *opts = osdc->client->options; | ||
3158 | struct rbd_obj_request *obj_request; | ||
3159 | int ret; | 3120 | int ret; |
3160 | 3121 | ||
3161 | obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, | 3122 | rbd_warn(rbd_dev, "encountered watch error: %d", err); |
3162 | OBJ_REQUEST_NODATA); | ||
3163 | if (!obj_request) | ||
3164 | return ERR_PTR(-ENOMEM); | ||
3165 | |||
3166 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, | ||
3167 | obj_request); | ||
3168 | if (!obj_request->osd_req) { | ||
3169 | ret = -ENOMEM; | ||
3170 | goto out; | ||
3171 | } | ||
3172 | |||
3173 | osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, | ||
3174 | rbd_dev->watch_event->cookie, 0, watch); | ||
3175 | rbd_osd_req_format_write(obj_request); | ||
3176 | 3123 | ||
3177 | if (watch) | 3124 | __rbd_dev_header_unwatch_sync(rbd_dev); |
3178 | ceph_osdc_set_request_linger(osdc, obj_request->osd_req); | ||
3179 | |||
3180 | ret = rbd_obj_request_submit(osdc, obj_request); | ||
3181 | if (ret) | ||
3182 | goto out; | ||
3183 | 3125 | ||
3184 | ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); | 3126 | ret = rbd_dev_header_watch_sync(rbd_dev); |
3185 | if (ret) | ||
3186 | goto out; | ||
3187 | |||
3188 | ret = obj_request->result; | ||
3189 | if (ret) { | 3127 | if (ret) { |
3190 | if (watch) | 3128 | rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); |
3191 | rbd_obj_request_end(obj_request); | 3129 | return; |
3192 | goto out; | ||
3193 | } | 3130 | } |
3194 | 3131 | ||
3195 | return obj_request; | 3132 | ret = rbd_dev_refresh(rbd_dev); |
3196 | 3133 | if (ret) | |
3197 | out: | 3134 | rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); |
3198 | rbd_obj_request_put(obj_request); | ||
3199 | return ERR_PTR(ret); | ||
3200 | } | 3135 | } |
3201 | 3136 | ||
3202 | /* | 3137 | /* |
@@ -3205,35 +3140,33 @@ out: | |||
3205 | static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) | 3140 | static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) |
3206 | { | 3141 | { |
3207 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | 3142 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
3208 | struct rbd_obj_request *obj_request; | 3143 | struct ceph_osd_linger_request *handle; |
3209 | int ret; | ||
3210 | 3144 | ||
3211 | rbd_assert(!rbd_dev->watch_event); | 3145 | rbd_assert(!rbd_dev->watch_handle); |
3212 | rbd_assert(!rbd_dev->watch_request); | ||
3213 | 3146 | ||
3214 | ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, | 3147 | handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, |
3215 | &rbd_dev->watch_event); | 3148 | &rbd_dev->header_oloc, rbd_watch_cb, |
3216 | if (ret < 0) | 3149 | rbd_watch_errcb, rbd_dev); |
3217 | return ret; | 3150 | if (IS_ERR(handle)) |
3151 | return PTR_ERR(handle); | ||
3218 | 3152 | ||
3219 | obj_request = rbd_obj_watch_request_helper(rbd_dev, true); | 3153 | rbd_dev->watch_handle = handle; |
3220 | if (IS_ERR(obj_request)) { | 3154 | return 0; |
3221 | ceph_osdc_cancel_event(rbd_dev->watch_event); | 3155 | } |
3222 | rbd_dev->watch_event = NULL; | ||
3223 | return PTR_ERR(obj_request); | ||
3224 | } | ||
3225 | 3156 | ||
3226 | /* | 3157 | static void __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) |
3227 | * A watch request is set to linger, so the underlying osd | 3158 | { |
3228 | * request won't go away until we unregister it. We retain | 3159 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
3229 | * a pointer to the object request during that time (in | 3160 | int ret; |
3230 | * rbd_dev->watch_request), so we'll keep a reference to it. | ||
3231 | * We'll drop that reference after we've unregistered it in | ||
3232 | * rbd_dev_header_unwatch_sync(). | ||
3233 | */ | ||
3234 | rbd_dev->watch_request = obj_request; | ||
3235 | 3161 | ||
3236 | return 0; | 3162 | if (!rbd_dev->watch_handle) |
3163 | return; | ||
3164 | |||
3165 | ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); | ||
3166 | if (ret) | ||
3167 | rbd_warn(rbd_dev, "failed to unwatch: %d", ret); | ||
3168 | |||
3169 | rbd_dev->watch_handle = NULL; | ||
3237 | } | 3170 | } |
3238 | 3171 | ||
3239 | /* | 3172 | /* |
@@ -3241,24 +3174,7 @@ static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) | |||
3241 | */ | 3174 | */ |
3242 | static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) | 3175 | static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) |
3243 | { | 3176 | { |
3244 | struct rbd_obj_request *obj_request; | 3177 | __rbd_dev_header_unwatch_sync(rbd_dev); |
3245 | |||
3246 | rbd_assert(rbd_dev->watch_event); | ||
3247 | rbd_assert(rbd_dev->watch_request); | ||
3248 | |||
3249 | rbd_obj_request_end(rbd_dev->watch_request); | ||
3250 | rbd_obj_request_put(rbd_dev->watch_request); | ||
3251 | rbd_dev->watch_request = NULL; | ||
3252 | |||
3253 | obj_request = rbd_obj_watch_request_helper(rbd_dev, false); | ||
3254 | if (!IS_ERR(obj_request)) | ||
3255 | rbd_obj_request_put(obj_request); | ||
3256 | else | ||
3257 | rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", | ||
3258 | PTR_ERR(obj_request)); | ||
3259 | |||
3260 | ceph_osdc_cancel_event(rbd_dev->watch_event); | ||
3261 | rbd_dev->watch_event = NULL; | ||
3262 | 3178 | ||
3263 | dout("%s flushing notifies\n", __func__); | 3179 | dout("%s flushing notifies\n", __func__); |
3264 | ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); | 3180 | ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); |
@@ -3591,7 +3507,7 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) | |||
3591 | if (!ondisk) | 3507 | if (!ondisk) |
3592 | return -ENOMEM; | 3508 | return -ENOMEM; |
3593 | 3509 | ||
3594 | ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, | 3510 | ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, |
3595 | 0, size, ondisk); | 3511 | 0, size, ondisk); |
3596 | if (ret < 0) | 3512 | if (ret < 0) |
3597 | goto out; | 3513 | goto out; |
@@ -4033,6 +3949,8 @@ static void rbd_dev_release(struct device *dev) | |||
4033 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); | 3949 | struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); |
4034 | bool need_put = !!rbd_dev->opts; | 3950 | bool need_put = !!rbd_dev->opts; |
4035 | 3951 | ||
3952 | ceph_oid_destroy(&rbd_dev->header_oid); | ||
3953 | |||
4036 | rbd_put_client(rbd_dev->rbd_client); | 3954 | rbd_put_client(rbd_dev->rbd_client); |
4037 | rbd_spec_put(rbd_dev->spec); | 3955 | rbd_spec_put(rbd_dev->spec); |
4038 | kfree(rbd_dev->opts); | 3956 | kfree(rbd_dev->opts); |
@@ -4063,6 +3981,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, | |||
4063 | INIT_LIST_HEAD(&rbd_dev->node); | 3981 | INIT_LIST_HEAD(&rbd_dev->node); |
4064 | init_rwsem(&rbd_dev->header_rwsem); | 3982 | init_rwsem(&rbd_dev->header_rwsem); |
4065 | 3983 | ||
3984 | ceph_oid_init(&rbd_dev->header_oid); | ||
3985 | ceph_oloc_init(&rbd_dev->header_oloc); | ||
3986 | |||
4066 | rbd_dev->dev.bus = &rbd_bus_type; | 3987 | rbd_dev->dev.bus = &rbd_bus_type; |
4067 | rbd_dev->dev.type = &rbd_device_type; | 3988 | rbd_dev->dev.type = &rbd_device_type; |
4068 | rbd_dev->dev.parent = &rbd_root_dev; | 3989 | rbd_dev->dev.parent = &rbd_root_dev; |
@@ -4111,7 +4032,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, | |||
4111 | __le64 size; | 4032 | __le64 size; |
4112 | } __attribute__ ((packed)) size_buf = { 0 }; | 4033 | } __attribute__ ((packed)) size_buf = { 0 }; |
4113 | 4034 | ||
4114 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4035 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4115 | "rbd", "get_size", | 4036 | "rbd", "get_size", |
4116 | &snapid, sizeof (snapid), | 4037 | &snapid, sizeof (snapid), |
4117 | &size_buf, sizeof (size_buf)); | 4038 | &size_buf, sizeof (size_buf)); |
@@ -4151,7 +4072,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) | |||
4151 | if (!reply_buf) | 4072 | if (!reply_buf) |
4152 | return -ENOMEM; | 4073 | return -ENOMEM; |
4153 | 4074 | ||
4154 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4075 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4155 | "rbd", "get_object_prefix", NULL, 0, | 4076 | "rbd", "get_object_prefix", NULL, 0, |
4156 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX); | 4077 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX); |
4157 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4078 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
@@ -4186,7 +4107,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, | |||
4186 | u64 unsup; | 4107 | u64 unsup; |
4187 | int ret; | 4108 | int ret; |
4188 | 4109 | ||
4189 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4110 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4190 | "rbd", "get_features", | 4111 | "rbd", "get_features", |
4191 | &snapid, sizeof (snapid), | 4112 | &snapid, sizeof (snapid), |
4192 | &features_buf, sizeof (features_buf)); | 4113 | &features_buf, sizeof (features_buf)); |
@@ -4248,7 +4169,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | |||
4248 | } | 4169 | } |
4249 | 4170 | ||
4250 | snapid = cpu_to_le64(rbd_dev->spec->snap_id); | 4171 | snapid = cpu_to_le64(rbd_dev->spec->snap_id); |
4251 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4172 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4252 | "rbd", "get_parent", | 4173 | "rbd", "get_parent", |
4253 | &snapid, sizeof (snapid), | 4174 | &snapid, sizeof (snapid), |
4254 | reply_buf, size); | 4175 | reply_buf, size); |
@@ -4351,7 +4272,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) | |||
4351 | u64 stripe_count; | 4272 | u64 stripe_count; |
4352 | int ret; | 4273 | int ret; |
4353 | 4274 | ||
4354 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4275 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4355 | "rbd", "get_stripe_unit_count", NULL, 0, | 4276 | "rbd", "get_stripe_unit_count", NULL, 0, |
4356 | (char *)&striping_info_buf, size); | 4277 | (char *)&striping_info_buf, size); |
4357 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4278 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
@@ -4599,7 +4520,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) | |||
4599 | if (!reply_buf) | 4520 | if (!reply_buf) |
4600 | return -ENOMEM; | 4521 | return -ENOMEM; |
4601 | 4522 | ||
4602 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4523 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4603 | "rbd", "get_snapcontext", NULL, 0, | 4524 | "rbd", "get_snapcontext", NULL, 0, |
4604 | reply_buf, size); | 4525 | reply_buf, size); |
4605 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4526 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
@@ -4664,7 +4585,7 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, | |||
4664 | return ERR_PTR(-ENOMEM); | 4585 | return ERR_PTR(-ENOMEM); |
4665 | 4586 | ||
4666 | snapid = cpu_to_le64(snap_id); | 4587 | snapid = cpu_to_le64(snap_id); |
4667 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, | 4588 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, |
4668 | "rbd", "get_snapshot_name", | 4589 | "rbd", "get_snapshot_name", |
4669 | &snapid, sizeof (snapid), | 4590 | &snapid, sizeof (snapid), |
4670 | reply_buf, size); | 4591 | reply_buf, size); |
@@ -4975,13 +4896,13 @@ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) | |||
4975 | again: | 4896 | again: |
4976 | ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); | 4897 | ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); |
4977 | if (ret == -ENOENT && tries++ < 1) { | 4898 | if (ret == -ENOENT && tries++ < 1) { |
4978 | ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", | 4899 | ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", |
4979 | &newest_epoch); | 4900 | &newest_epoch); |
4980 | if (ret < 0) | 4901 | if (ret < 0) |
4981 | return ret; | 4902 | return ret; |
4982 | 4903 | ||
4983 | if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { | 4904 | if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { |
4984 | ceph_monc_request_next_osdmap(&rbdc->client->monc); | 4905 | ceph_osdc_maybe_request_map(&rbdc->client->osdc); |
4985 | (void) ceph_monc_wait_osdmap(&rbdc->client->monc, | 4906 | (void) ceph_monc_wait_osdmap(&rbdc->client->monc, |
4986 | newest_epoch, | 4907 | newest_epoch, |
4987 | opts->mount_timeout); | 4908 | opts->mount_timeout); |
@@ -5260,35 +5181,26 @@ err_out_unlock: | |||
5260 | static int rbd_dev_header_name(struct rbd_device *rbd_dev) | 5181 | static int rbd_dev_header_name(struct rbd_device *rbd_dev) |
5261 | { | 5182 | { |
5262 | struct rbd_spec *spec = rbd_dev->spec; | 5183 | struct rbd_spec *spec = rbd_dev->spec; |
5263 | size_t size; | 5184 | int ret; |
5264 | 5185 | ||
5265 | /* Record the header object name for this rbd image. */ | 5186 | /* Record the header object name for this rbd image. */ |
5266 | 5187 | ||
5267 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | 5188 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); |
5268 | 5189 | ||
5190 | rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); | ||
5269 | if (rbd_dev->image_format == 1) | 5191 | if (rbd_dev->image_format == 1) |
5270 | size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); | 5192 | ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", |
5193 | spec->image_name, RBD_SUFFIX); | ||
5271 | else | 5194 | else |
5272 | size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); | 5195 | ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", |
5273 | 5196 | RBD_HEADER_PREFIX, spec->image_id); | |
5274 | rbd_dev->header_name = kmalloc(size, GFP_KERNEL); | ||
5275 | if (!rbd_dev->header_name) | ||
5276 | return -ENOMEM; | ||
5277 | 5197 | ||
5278 | if (rbd_dev->image_format == 1) | 5198 | return ret; |
5279 | sprintf(rbd_dev->header_name, "%s%s", | ||
5280 | spec->image_name, RBD_SUFFIX); | ||
5281 | else | ||
5282 | sprintf(rbd_dev->header_name, "%s%s", | ||
5283 | RBD_HEADER_PREFIX, spec->image_id); | ||
5284 | return 0; | ||
5285 | } | 5199 | } |
5286 | 5200 | ||
5287 | static void rbd_dev_image_release(struct rbd_device *rbd_dev) | 5201 | static void rbd_dev_image_release(struct rbd_device *rbd_dev) |
5288 | { | 5202 | { |
5289 | rbd_dev_unprobe(rbd_dev); | 5203 | rbd_dev_unprobe(rbd_dev); |
5290 | kfree(rbd_dev->header_name); | ||
5291 | rbd_dev->header_name = NULL; | ||
5292 | rbd_dev->image_format = 0; | 5204 | rbd_dev->image_format = 0; |
5293 | kfree(rbd_dev->spec->image_id); | 5205 | kfree(rbd_dev->spec->image_id); |
5294 | rbd_dev->spec->image_id = NULL; | 5206 | rbd_dev->spec->image_id = NULL; |
@@ -5327,7 +5239,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) | |||
5327 | pr_info("image %s/%s does not exist\n", | 5239 | pr_info("image %s/%s does not exist\n", |
5328 | rbd_dev->spec->pool_name, | 5240 | rbd_dev->spec->pool_name, |
5329 | rbd_dev->spec->image_name); | 5241 | rbd_dev->spec->image_name); |
5330 | goto out_header_name; | 5242 | goto err_out_format; |
5331 | } | 5243 | } |
5332 | } | 5244 | } |
5333 | 5245 | ||
@@ -5373,7 +5285,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) | |||
5373 | goto err_out_probe; | 5285 | goto err_out_probe; |
5374 | 5286 | ||
5375 | dout("discovered format %u image, header name is %s\n", | 5287 | dout("discovered format %u image, header name is %s\n", |
5376 | rbd_dev->image_format, rbd_dev->header_name); | 5288 | rbd_dev->image_format, rbd_dev->header_oid.name); |
5377 | return 0; | 5289 | return 0; |
5378 | 5290 | ||
5379 | err_out_probe: | 5291 | err_out_probe: |
@@ -5381,9 +5293,6 @@ err_out_probe: | |||
5381 | err_out_watch: | 5293 | err_out_watch: |
5382 | if (!depth) | 5294 | if (!depth) |
5383 | rbd_dev_header_unwatch_sync(rbd_dev); | 5295 | rbd_dev_header_unwatch_sync(rbd_dev); |
5384 | out_header_name: | ||
5385 | kfree(rbd_dev->header_name); | ||
5386 | rbd_dev->header_name = NULL; | ||
5387 | err_out_format: | 5296 | err_out_format: |
5388 | rbd_dev->image_format = 0; | 5297 | rbd_dev->image_format = 0; |
5389 | kfree(rbd_dev->spec->image_id); | 5298 | kfree(rbd_dev->spec->image_id); |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 43098cd9602b..eeb71e5de27a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) | |||
257 | /* | 257 | /* |
258 | * Finish an async read(ahead) op. | 258 | * Finish an async read(ahead) op. |
259 | */ | 259 | */ |
260 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) | 260 | static void finish_read(struct ceph_osd_request *req) |
261 | { | 261 | { |
262 | struct inode *inode = req->r_inode; | 262 | struct inode *inode = req->r_inode; |
263 | struct ceph_osd_data *osd_data; | 263 | struct ceph_osd_data *osd_data; |
264 | int rc = req->r_result; | 264 | int rc = req->r_result <= 0 ? req->r_result : 0; |
265 | int bytes = le32_to_cpu(msg->hdr.data_len); | 265 | int bytes = req->r_result >= 0 ? req->r_result : 0; |
266 | int num_pages; | 266 | int num_pages; |
267 | int i; | 267 | int i; |
268 | 268 | ||
@@ -376,8 +376,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) | |||
376 | req->r_callback = finish_read; | 376 | req->r_callback = finish_read; |
377 | req->r_inode = inode; | 377 | req->r_inode = inode; |
378 | 378 | ||
379 | ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); | ||
380 | |||
381 | dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); | 379 | dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); |
382 | ret = ceph_osdc_start_request(osdc, req, false); | 380 | ret = ceph_osdc_start_request(osdc, req, false); |
383 | if (ret < 0) | 381 | if (ret < 0) |
@@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
546 | truncate_seq, truncate_size, | 544 | truncate_seq, truncate_size, |
547 | &inode->i_mtime, &page, 1); | 545 | &inode->i_mtime, &page, 1); |
548 | if (err < 0) { | 546 | if (err < 0) { |
549 | dout("writepage setting page/mapping error %d %p\n", err, page); | 547 | struct writeback_control tmp_wbc; |
548 | if (!wbc) | ||
549 | wbc = &tmp_wbc; | ||
550 | if (err == -ERESTARTSYS) { | ||
551 | /* killed by SIGKILL */ | ||
552 | dout("writepage interrupted page %p\n", page); | ||
553 | redirty_page_for_writepage(wbc, page); | ||
554 | end_page_writeback(page); | ||
555 | goto out; | ||
556 | } | ||
557 | dout("writepage setting page/mapping error %d %p\n", | ||
558 | err, page); | ||
550 | SetPageError(page); | 559 | SetPageError(page); |
551 | mapping_set_error(&inode->i_data, err); | 560 | mapping_set_error(&inode->i_data, err); |
552 | if (wbc) | 561 | wbc->pages_skipped++; |
553 | wbc->pages_skipped++; | ||
554 | } else { | 562 | } else { |
555 | dout("writepage cleaned page %p\n", page); | 563 | dout("writepage cleaned page %p\n", page); |
556 | err = 0; /* vfs expects us to return 0 */ | 564 | err = 0; /* vfs expects us to return 0 */ |
@@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) | |||
571 | BUG_ON(!inode); | 579 | BUG_ON(!inode); |
572 | ihold(inode); | 580 | ihold(inode); |
573 | err = writepage_nounlock(page, wbc); | 581 | err = writepage_nounlock(page, wbc); |
582 | if (err == -ERESTARTSYS) { | ||
583 | /* direct memory reclaimer was killed by SIGKILL. return 0 | ||
584 | * to prevent caller from setting mapping/page error */ | ||
585 | err = 0; | ||
586 | } | ||
574 | unlock_page(page); | 587 | unlock_page(page); |
575 | iput(inode); | 588 | iput(inode); |
576 | return err; | 589 | return err; |
577 | } | 590 | } |
578 | 591 | ||
579 | |||
580 | /* | 592 | /* |
581 | * lame release_pages helper. release_pages() isn't exported to | 593 | * lame release_pages helper. release_pages() isn't exported to |
582 | * modules. | 594 | * modules. |
@@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num) | |||
600 | * If we get an error, set the mapping error bit, but not the individual | 612 | * If we get an error, set the mapping error bit, but not the individual |
601 | * page error bits. | 613 | * page error bits. |
602 | */ | 614 | */ |
603 | static void writepages_finish(struct ceph_osd_request *req, | 615 | static void writepages_finish(struct ceph_osd_request *req) |
604 | struct ceph_msg *msg) | ||
605 | { | 616 | { |
606 | struct inode *inode = req->r_inode; | 617 | struct inode *inode = req->r_inode; |
607 | struct ceph_inode_info *ci = ceph_inode(inode); | 618 | struct ceph_inode_info *ci = ceph_inode(inode); |
@@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
615 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 626 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
616 | bool remove_page; | 627 | bool remove_page; |
617 | 628 | ||
618 | |||
619 | dout("writepages_finish %p rc %d\n", inode, rc); | 629 | dout("writepages_finish %p rc %d\n", inode, rc); |
620 | if (rc < 0) | 630 | if (rc < 0) |
621 | mapping_set_error(mapping, rc); | 631 | mapping_set_error(mapping, rc); |
@@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
650 | clear_bdi_congested(&fsc->backing_dev_info, | 660 | clear_bdi_congested(&fsc->backing_dev_info, |
651 | BLK_RW_ASYNC); | 661 | BLK_RW_ASYNC); |
652 | 662 | ||
663 | if (rc < 0) | ||
664 | SetPageError(page); | ||
665 | |||
653 | ceph_put_snap_context(page_snap_context(page)); | 666 | ceph_put_snap_context(page_snap_context(page)); |
654 | page->private = 0; | 667 | page->private = 0; |
655 | ClearPagePrivate(page); | 668 | ClearPagePrivate(page); |
@@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
718 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); | 731 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); |
719 | 732 | ||
720 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 733 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
721 | pr_warn("writepage_start %p on forced umount\n", inode); | 734 | if (ci->i_wrbuffer_ref > 0) { |
722 | truncate_pagecache(inode, 0); | 735 | pr_warn_ratelimited( |
736 | "writepage_start %p %lld forced umount\n", | ||
737 | inode, ceph_ino(inode)); | ||
738 | } | ||
723 | mapping_set_error(mapping, -EIO); | 739 | mapping_set_error(mapping, -EIO); |
724 | return -EIO; /* we're in a forced umount, don't write! */ | 740 | return -EIO; /* we're in a forced umount, don't write! */ |
725 | } | 741 | } |
@@ -1063,10 +1079,7 @@ new_request: | |||
1063 | pages = NULL; | 1079 | pages = NULL; |
1064 | } | 1080 | } |
1065 | 1081 | ||
1066 | vino = ceph_vino(inode); | 1082 | req->r_mtime = inode->i_mtime; |
1067 | ceph_osdc_build_request(req, offset, snapc, vino.snap, | ||
1068 | &inode->i_mtime); | ||
1069 | |||
1070 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); | 1083 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); |
1071 | BUG_ON(rc); | 1084 | BUG_ON(rc); |
1072 | req = NULL; | 1085 | req = NULL; |
@@ -1099,8 +1112,7 @@ release_pvec_pages: | |||
1099 | mapping->writeback_index = index; | 1112 | mapping->writeback_index = index; |
1100 | 1113 | ||
1101 | out: | 1114 | out: |
1102 | if (req) | 1115 | ceph_osdc_put_request(req); |
1103 | ceph_osdc_put_request(req); | ||
1104 | ceph_put_snap_context(snapc); | 1116 | ceph_put_snap_context(snapc); |
1105 | dout("writepages done, rc = %d\n", rc); | 1117 | dout("writepages done, rc = %d\n", rc); |
1106 | return rc; | 1118 | return rc; |
@@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file, | |||
1134 | struct page *page) | 1146 | struct page *page) |
1135 | { | 1147 | { |
1136 | struct inode *inode = file_inode(file); | 1148 | struct inode *inode = file_inode(file); |
1149 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
1137 | struct ceph_inode_info *ci = ceph_inode(inode); | 1150 | struct ceph_inode_info *ci = ceph_inode(inode); |
1138 | loff_t page_off = pos & PAGE_MASK; | 1151 | loff_t page_off = pos & PAGE_MASK; |
1139 | int pos_in_page = pos & ~PAGE_MASK; | 1152 | int pos_in_page = pos & ~PAGE_MASK; |
@@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file, | |||
1142 | int r; | 1155 | int r; |
1143 | struct ceph_snap_context *snapc, *oldest; | 1156 | struct ceph_snap_context *snapc, *oldest; |
1144 | 1157 | ||
1158 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | ||
1159 | dout(" page %p forced umount\n", page); | ||
1160 | unlock_page(page); | ||
1161 | return -EIO; | ||
1162 | } | ||
1163 | |||
1145 | retry_locked: | 1164 | retry_locked: |
1146 | /* writepages currently holds page lock, but if we change that later, */ | 1165 | /* writepages currently holds page lock, but if we change that later, */ |
1147 | wait_on_page_writeback(page); | 1166 | wait_on_page_writeback(page); |
@@ -1165,7 +1184,7 @@ retry_locked: | |||
1165 | snapc = ceph_get_snap_context(snapc); | 1184 | snapc = ceph_get_snap_context(snapc); |
1166 | unlock_page(page); | 1185 | unlock_page(page); |
1167 | ceph_queue_writeback(inode); | 1186 | ceph_queue_writeback(inode); |
1168 | r = wait_event_interruptible(ci->i_cap_wq, | 1187 | r = wait_event_killable(ci->i_cap_wq, |
1169 | context_is_writeable_or_written(inode, snapc)); | 1188 | context_is_writeable_or_written(inode, snapc)); |
1170 | ceph_put_snap_context(snapc); | 1189 | ceph_put_snap_context(snapc); |
1171 | if (r == -ERESTARTSYS) | 1190 | if (r == -ERESTARTSYS) |
@@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = { | |||
1311 | .direct_IO = ceph_direct_io, | 1330 | .direct_IO = ceph_direct_io, |
1312 | }; | 1331 | }; |
1313 | 1332 | ||
1333 | static void ceph_block_sigs(sigset_t *oldset) | ||
1334 | { | ||
1335 | sigset_t mask; | ||
1336 | siginitsetinv(&mask, sigmask(SIGKILL)); | ||
1337 | sigprocmask(SIG_BLOCK, &mask, oldset); | ||
1338 | } | ||
1339 | |||
1340 | static void ceph_restore_sigs(sigset_t *oldset) | ||
1341 | { | ||
1342 | sigprocmask(SIG_SETMASK, oldset, NULL); | ||
1343 | } | ||
1314 | 1344 | ||
1315 | /* | 1345 | /* |
1316 | * vm ops | 1346 | * vm ops |
@@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1323 | struct page *pinned_page = NULL; | 1353 | struct page *pinned_page = NULL; |
1324 | loff_t off = vmf->pgoff << PAGE_SHIFT; | 1354 | loff_t off = vmf->pgoff << PAGE_SHIFT; |
1325 | int want, got, ret; | 1355 | int want, got, ret; |
1356 | sigset_t oldset; | ||
1357 | |||
1358 | ceph_block_sigs(&oldset); | ||
1326 | 1359 | ||
1327 | dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", | 1360 | dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", |
1328 | inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); | 1361 | inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); |
@@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1330 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; | 1363 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; |
1331 | else | 1364 | else |
1332 | want = CEPH_CAP_FILE_CACHE; | 1365 | want = CEPH_CAP_FILE_CACHE; |
1333 | while (1) { | 1366 | |
1334 | got = 0; | 1367 | got = 0; |
1335 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, | 1368 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); |
1336 | -1, &got, &pinned_page); | 1369 | if (ret < 0) |
1337 | if (ret == 0) | 1370 | goto out_restore; |
1338 | break; | 1371 | |
1339 | if (ret != -ERESTARTSYS) { | ||
1340 | WARN_ON(1); | ||
1341 | return VM_FAULT_SIGBUS; | ||
1342 | } | ||
1343 | } | ||
1344 | dout("filemap_fault %p %llu~%zd got cap refs on %s\n", | 1372 | dout("filemap_fault %p %llu~%zd got cap refs on %s\n", |
1345 | inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); | 1373 | inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); |
1346 | 1374 | ||
@@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1357 | ceph_put_cap_refs(ci, got); | 1385 | ceph_put_cap_refs(ci, got); |
1358 | 1386 | ||
1359 | if (ret != -EAGAIN) | 1387 | if (ret != -EAGAIN) |
1360 | return ret; | 1388 | goto out_restore; |
1361 | 1389 | ||
1362 | /* read inline data */ | 1390 | /* read inline data */ |
1363 | if (off >= PAGE_SIZE) { | 1391 | if (off >= PAGE_SIZE) { |
@@ -1371,15 +1399,18 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1371 | ~__GFP_FS)); | 1399 | ~__GFP_FS)); |
1372 | if (!page) { | 1400 | if (!page) { |
1373 | ret = VM_FAULT_OOM; | 1401 | ret = VM_FAULT_OOM; |
1374 | goto out; | 1402 | goto out_inline; |
1375 | } | 1403 | } |
1376 | ret1 = __ceph_do_getattr(inode, page, | 1404 | ret1 = __ceph_do_getattr(inode, page, |
1377 | CEPH_STAT_CAP_INLINE_DATA, true); | 1405 | CEPH_STAT_CAP_INLINE_DATA, true); |
1378 | if (ret1 < 0 || off >= i_size_read(inode)) { | 1406 | if (ret1 < 0 || off >= i_size_read(inode)) { |
1379 | unlock_page(page); | 1407 | unlock_page(page); |
1380 | put_page(page); | 1408 | put_page(page); |
1381 | ret = VM_FAULT_SIGBUS; | 1409 | if (ret1 < 0) |
1382 | goto out; | 1410 | ret = ret1; |
1411 | else | ||
1412 | ret = VM_FAULT_SIGBUS; | ||
1413 | goto out_inline; | ||
1383 | } | 1414 | } |
1384 | if (ret1 < PAGE_SIZE) | 1415 | if (ret1 < PAGE_SIZE) |
1385 | zero_user_segment(page, ret1, PAGE_SIZE); | 1416 | zero_user_segment(page, ret1, PAGE_SIZE); |
@@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1388 | SetPageUptodate(page); | 1419 | SetPageUptodate(page); |
1389 | vmf->page = page; | 1420 | vmf->page = page; |
1390 | ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; | 1421 | ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; |
1422 | out_inline: | ||
1423 | dout("filemap_fault %p %llu~%zd read inline data ret %d\n", | ||
1424 | inode, off, (size_t)PAGE_SIZE, ret); | ||
1391 | } | 1425 | } |
1392 | out: | 1426 | out_restore: |
1393 | dout("filemap_fault %p %llu~%zd read inline data ret %d\n", | 1427 | ceph_restore_sigs(&oldset); |
1394 | inode, off, (size_t)PAGE_SIZE, ret); | 1428 | if (ret < 0) |
1429 | ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; | ||
1430 | |||
1395 | return ret; | 1431 | return ret; |
1396 | } | 1432 | } |
1397 | 1433 | ||
@@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1409 | loff_t size = i_size_read(inode); | 1445 | loff_t size = i_size_read(inode); |
1410 | size_t len; | 1446 | size_t len; |
1411 | int want, got, ret; | 1447 | int want, got, ret; |
1448 | sigset_t oldset; | ||
1412 | 1449 | ||
1413 | prealloc_cf = ceph_alloc_cap_flush(); | 1450 | prealloc_cf = ceph_alloc_cap_flush(); |
1414 | if (!prealloc_cf) | 1451 | if (!prealloc_cf) |
1415 | return VM_FAULT_SIGBUS; | 1452 | return VM_FAULT_OOM; |
1453 | |||
1454 | ceph_block_sigs(&oldset); | ||
1416 | 1455 | ||
1417 | if (ci->i_inline_version != CEPH_INLINE_NONE) { | 1456 | if (ci->i_inline_version != CEPH_INLINE_NONE) { |
1418 | struct page *locked_page = NULL; | 1457 | struct page *locked_page = NULL; |
@@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1423 | ret = ceph_uninline_data(vma->vm_file, locked_page); | 1462 | ret = ceph_uninline_data(vma->vm_file, locked_page); |
1424 | if (locked_page) | 1463 | if (locked_page) |
1425 | unlock_page(locked_page); | 1464 | unlock_page(locked_page); |
1426 | if (ret < 0) { | 1465 | if (ret < 0) |
1427 | ret = VM_FAULT_SIGBUS; | ||
1428 | goto out_free; | 1466 | goto out_free; |
1429 | } | ||
1430 | } | 1467 | } |
1431 | 1468 | ||
1432 | if (off + PAGE_SIZE <= size) | 1469 | if (off + PAGE_SIZE <= size) |
@@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1440 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | 1477 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; |
1441 | else | 1478 | else |
1442 | want = CEPH_CAP_FILE_BUFFER; | 1479 | want = CEPH_CAP_FILE_BUFFER; |
1443 | while (1) { | 1480 | |
1444 | got = 0; | 1481 | got = 0; |
1445 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, | 1482 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, |
1446 | &got, NULL); | 1483 | &got, NULL); |
1447 | if (ret == 0) | 1484 | if (ret < 0) |
1448 | break; | 1485 | goto out_free; |
1449 | if (ret != -ERESTARTSYS) { | 1486 | |
1450 | WARN_ON(1); | ||
1451 | ret = VM_FAULT_SIGBUS; | ||
1452 | goto out_free; | ||
1453 | } | ||
1454 | } | ||
1455 | dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", | 1487 | dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", |
1456 | inode, off, len, ceph_cap_string(got)); | 1488 | inode, off, len, ceph_cap_string(got)); |
1457 | 1489 | ||
1458 | /* Update time before taking page lock */ | 1490 | /* Update time before taking page lock */ |
1459 | file_update_time(vma->vm_file); | 1491 | file_update_time(vma->vm_file); |
1460 | 1492 | ||
1461 | lock_page(page); | 1493 | do { |
1494 | lock_page(page); | ||
1462 | 1495 | ||
1463 | ret = VM_FAULT_NOPAGE; | 1496 | if ((off > size) || (page->mapping != inode->i_mapping)) { |
1464 | if ((off > size) || | 1497 | unlock_page(page); |
1465 | (page->mapping != inode->i_mapping)) { | 1498 | ret = VM_FAULT_NOPAGE; |
1466 | unlock_page(page); | 1499 | break; |
1467 | goto out; | 1500 | } |
1468 | } | 1501 | |
1502 | ret = ceph_update_writeable_page(vma->vm_file, off, len, page); | ||
1503 | if (ret >= 0) { | ||
1504 | /* success. we'll keep the page locked. */ | ||
1505 | set_page_dirty(page); | ||
1506 | ret = VM_FAULT_LOCKED; | ||
1507 | } | ||
1508 | } while (ret == -EAGAIN); | ||
1469 | 1509 | ||
1470 | ret = ceph_update_writeable_page(vma->vm_file, off, len, page); | ||
1471 | if (ret >= 0) { | ||
1472 | /* success. we'll keep the page locked. */ | ||
1473 | set_page_dirty(page); | ||
1474 | ret = VM_FAULT_LOCKED; | ||
1475 | } else { | ||
1476 | if (ret == -ENOMEM) | ||
1477 | ret = VM_FAULT_OOM; | ||
1478 | else | ||
1479 | ret = VM_FAULT_SIGBUS; | ||
1480 | } | ||
1481 | out: | ||
1482 | if (ret == VM_FAULT_LOCKED || | 1510 | if (ret == VM_FAULT_LOCKED || |
1483 | ci->i_inline_version != CEPH_INLINE_NONE) { | 1511 | ci->i_inline_version != CEPH_INLINE_NONE) { |
1484 | int dirty; | 1512 | int dirty; |
@@ -1495,8 +1523,10 @@ out: | |||
1495 | inode, off, len, ceph_cap_string(got), ret); | 1523 | inode, off, len, ceph_cap_string(got), ret); |
1496 | ceph_put_cap_refs(ci, got); | 1524 | ceph_put_cap_refs(ci, got); |
1497 | out_free: | 1525 | out_free: |
1526 | ceph_restore_sigs(&oldset); | ||
1498 | ceph_free_cap_flush(prealloc_cf); | 1527 | ceph_free_cap_flush(prealloc_cf); |
1499 | 1528 | if (ret < 0) | |
1529 | ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; | ||
1500 | return ret; | 1530 | return ret; |
1501 | } | 1531 | } |
1502 | 1532 | ||
@@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) | |||
1614 | goto out; | 1644 | goto out; |
1615 | } | 1645 | } |
1616 | 1646 | ||
1617 | ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); | 1647 | req->r_mtime = inode->i_mtime; |
1618 | err = ceph_osdc_start_request(&fsc->client->osdc, req, false); | 1648 | err = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
1619 | if (!err) | 1649 | if (!err) |
1620 | err = ceph_osdc_wait_request(&fsc->client->osdc, req); | 1650 | err = ceph_osdc_wait_request(&fsc->client->osdc, req); |
@@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) | |||
1657 | goto out_put; | 1687 | goto out_put; |
1658 | } | 1688 | } |
1659 | 1689 | ||
1660 | ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); | 1690 | req->r_mtime = inode->i_mtime; |
1661 | err = ceph_osdc_start_request(&fsc->client->osdc, req, false); | 1691 | err = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
1662 | if (!err) | 1692 | if (!err) |
1663 | err = ceph_osdc_wait_request(&fsc->client->osdc, req); | 1693 | err = ceph_osdc_wait_request(&fsc->client->osdc, req); |
@@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) | |||
1758 | rd_req->r_flags = CEPH_OSD_FLAG_READ; | 1788 | rd_req->r_flags = CEPH_OSD_FLAG_READ; |
1759 | osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); | 1789 | osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); |
1760 | rd_req->r_base_oloc.pool = pool; | 1790 | rd_req->r_base_oloc.pool = pool; |
1761 | snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), | 1791 | ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); |
1762 | "%llx.00000000", ci->i_vino.ino); | 1792 | |
1763 | rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); | 1793 | err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); |
1794 | if (err) | ||
1795 | goto out_unlock; | ||
1764 | 1796 | ||
1765 | wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, | 1797 | wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, |
1766 | 1, false, GFP_NOFS); | 1798 | 1, false, GFP_NOFS); |
@@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) | |||
1769 | goto out_unlock; | 1801 | goto out_unlock; |
1770 | } | 1802 | } |
1771 | 1803 | ||
1772 | wr_req->r_flags = CEPH_OSD_FLAG_WRITE | | 1804 | wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; |
1773 | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; | ||
1774 | osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); | 1805 | osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); |
1775 | wr_req->r_base_oloc.pool = pool; | 1806 | ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); |
1776 | wr_req->r_base_oid = rd_req->r_base_oid; | 1807 | ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); |
1808 | |||
1809 | err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); | ||
1810 | if (err) | ||
1811 | goto out_unlock; | ||
1777 | 1812 | ||
1778 | /* one page should be large enough for STAT data */ | 1813 | /* one page should be large enough for STAT data */ |
1779 | pages = ceph_alloc_page_vector(1, GFP_KERNEL); | 1814 | pages = ceph_alloc_page_vector(1, GFP_KERNEL); |
@@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) | |||
1784 | 1819 | ||
1785 | osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, | 1820 | osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, |
1786 | 0, false, true); | 1821 | 0, false, true); |
1787 | ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, | ||
1788 | &ci->vfs_inode.i_mtime); | ||
1789 | err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); | 1822 | err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); |
1790 | 1823 | ||
1791 | ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, | 1824 | wr_req->r_mtime = ci->vfs_inode.i_mtime; |
1792 | &ci->vfs_inode.i_mtime); | ||
1793 | err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); | 1825 | err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); |
1794 | 1826 | ||
1795 | if (!err) | 1827 | if (!err) |
@@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) | |||
1823 | out_unlock: | 1855 | out_unlock: |
1824 | up_write(&mdsc->pool_perm_rwsem); | 1856 | up_write(&mdsc->pool_perm_rwsem); |
1825 | 1857 | ||
1826 | if (rd_req) | 1858 | ceph_osdc_put_request(rd_req); |
1827 | ceph_osdc_put_request(rd_req); | 1859 | ceph_osdc_put_request(wr_req); |
1828 | if (wr_req) | ||
1829 | ceph_osdc_put_request(wr_req); | ||
1830 | out: | 1860 | out: |
1831 | if (!err) | 1861 | if (!err) |
1832 | err = have; | 1862 | err = have; |
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..c052b5bf219b 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c | |||
@@ -236,7 +236,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int | |||
236 | unlock_page(page); | 236 | unlock_page(page); |
237 | } | 237 | } |
238 | 238 | ||
239 | static inline int cache_valid(struct ceph_inode_info *ci) | 239 | static inline bool cache_valid(struct ceph_inode_info *ci) |
240 | { | 240 | { |
241 | return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && | 241 | return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && |
242 | (ci->i_fscache_gen == ci->i_rdcache_gen)); | 242 | (ci->i_fscache_gen == ci->i_rdcache_gen)); |
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbca..c17b5d76d75e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -1656,7 +1656,7 @@ retry_locked: | |||
1656 | */ | 1656 | */ |
1657 | if ((!is_delayed || mdsc->stopping) && | 1657 | if ((!is_delayed || mdsc->stopping) && |
1658 | !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ | 1658 | !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ |
1659 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ | 1659 | !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ |
1660 | inode->i_data.nrpages && /* have cached pages */ | 1660 | inode->i_data.nrpages && /* have cached pages */ |
1661 | (revoking & (CEPH_CAP_FILE_CACHE| | 1661 | (revoking & (CEPH_CAP_FILE_CACHE| |
1662 | CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ | 1662 | CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ |
@@ -1698,8 +1698,8 @@ retry_locked: | |||
1698 | 1698 | ||
1699 | revoking = cap->implemented & ~cap->issued; | 1699 | revoking = cap->implemented & ~cap->issued; |
1700 | dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", | 1700 | dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", |
1701 | cap->mds, cap, ceph_cap_string(cap->issued), | 1701 | cap->mds, cap, ceph_cap_string(cap_used), |
1702 | ceph_cap_string(cap_used), | 1702 | ceph_cap_string(cap->issued), |
1703 | ceph_cap_string(cap->implemented), | 1703 | ceph_cap_string(cap->implemented), |
1704 | ceph_cap_string(revoking)); | 1704 | ceph_cap_string(revoking)); |
1705 | 1705 | ||
@@ -2317,7 +2317,7 @@ again: | |||
2317 | 2317 | ||
2318 | /* make sure file is actually open */ | 2318 | /* make sure file is actually open */ |
2319 | file_wanted = __ceph_caps_file_wanted(ci); | 2319 | file_wanted = __ceph_caps_file_wanted(ci); |
2320 | if ((file_wanted & need) == 0) { | 2320 | if ((file_wanted & need) != need) { |
2321 | dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", | 2321 | dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", |
2322 | ceph_cap_string(need), ceph_cap_string(file_wanted)); | 2322 | ceph_cap_string(need), ceph_cap_string(file_wanted)); |
2323 | *err = -EBADF; | 2323 | *err = -EBADF; |
@@ -2412,12 +2412,26 @@ again: | |||
2412 | goto out_unlock; | 2412 | goto out_unlock; |
2413 | } | 2413 | } |
2414 | 2414 | ||
2415 | if (!__ceph_is_any_caps(ci) && | 2415 | if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { |
2416 | ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 2416 | int mds_wanted; |
2417 | dout("get_cap_refs %p forced umount\n", inode); | 2417 | if (ACCESS_ONCE(mdsc->fsc->mount_state) == |
2418 | *err = -EIO; | 2418 | CEPH_MOUNT_SHUTDOWN) { |
2419 | ret = 1; | 2419 | dout("get_cap_refs %p forced umount\n", inode); |
2420 | goto out_unlock; | 2420 | *err = -EIO; |
2421 | ret = 1; | ||
2422 | goto out_unlock; | ||
2423 | } | ||
2424 | mds_wanted = __ceph_caps_mds_wanted(ci); | ||
2425 | if ((mds_wanted & need) != need) { | ||
2426 | dout("get_cap_refs %p caps were dropped" | ||
2427 | " (session killed?)\n", inode); | ||
2428 | *err = -ESTALE; | ||
2429 | ret = 1; | ||
2430 | goto out_unlock; | ||
2431 | } | ||
2432 | if ((mds_wanted & file_wanted) == | ||
2433 | (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) | ||
2434 | ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; | ||
2421 | } | 2435 | } |
2422 | 2436 | ||
2423 | dout("get_cap_refs %p have %s needed %s\n", inode, | 2437 | dout("get_cap_refs %p have %s needed %s\n", inode, |
@@ -2487,7 +2501,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | |||
2487 | if (err == -EAGAIN) | 2501 | if (err == -EAGAIN) |
2488 | continue; | 2502 | continue; |
2489 | if (err < 0) | 2503 | if (err < 0) |
2490 | return err; | 2504 | ret = err; |
2491 | } else { | 2505 | } else { |
2492 | ret = wait_event_interruptible(ci->i_cap_wq, | 2506 | ret = wait_event_interruptible(ci->i_cap_wq, |
2493 | try_get_cap_refs(ci, need, want, endoff, | 2507 | try_get_cap_refs(ci, need, want, endoff, |
@@ -2496,8 +2510,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | |||
2496 | continue; | 2510 | continue; |
2497 | if (err < 0) | 2511 | if (err < 0) |
2498 | ret = err; | 2512 | ret = err; |
2499 | if (ret < 0) | 2513 | } |
2500 | return ret; | 2514 | if (ret < 0) { |
2515 | if (err == -ESTALE) { | ||
2516 | /* session was killed, try renew caps */ | ||
2517 | ret = ceph_renew_caps(&ci->vfs_inode); | ||
2518 | if (ret == 0) | ||
2519 | continue; | ||
2520 | } | ||
2521 | return ret; | ||
2501 | } | 2522 | } |
2502 | 2523 | ||
2503 | if (ci->i_inline_version != CEPH_INLINE_NONE && | 2524 | if (ci->i_inline_version != CEPH_INLINE_NONE && |
@@ -2807,7 +2828,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, | |||
2807 | if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ | 2828 | if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ |
2808 | ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && | 2829 | ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && |
2809 | (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && | 2830 | (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
2810 | !ci->i_wrbuffer_ref) { | 2831 | !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { |
2811 | if (try_nonblocking_invalidate(inode)) { | 2832 | if (try_nonblocking_invalidate(inode)) { |
2812 | /* there were locked pages.. invalidate later | 2833 | /* there were locked pages.. invalidate later |
2813 | in a separate thread. */ | 2834 | in a separate thread. */ |
@@ -3226,6 +3247,8 @@ retry: | |||
3226 | 3247 | ||
3227 | if (target < 0) { | 3248 | if (target < 0) { |
3228 | __ceph_remove_cap(cap, false); | 3249 | __ceph_remove_cap(cap, false); |
3250 | if (!ci->i_auth_cap) | ||
3251 | ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; | ||
3229 | goto out_unlock; | 3252 | goto out_unlock; |
3230 | } | 3253 | } |
3231 | 3254 | ||
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed2..39ff678e567f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
@@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) | |||
109 | path ? path : ""); | 109 | path ? path : ""); |
110 | spin_unlock(&req->r_old_dentry->d_lock); | 110 | spin_unlock(&req->r_old_dentry->d_lock); |
111 | kfree(path); | 111 | kfree(path); |
112 | } else if (req->r_path2) { | 112 | } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { |
113 | if (req->r_ino2.ino) | 113 | if (req->r_ino2.ino) |
114 | seq_printf(s, " #%llx/%s", req->r_ino2.ino, | 114 | seq_printf(s, " #%llx/%s", req->r_ino2.ino, |
115 | req->r_path2); | 115 | req->r_path2); |
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3ab1192d2029..6e0fedf6713b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
@@ -70,16 +70,42 @@ out_unlock: | |||
70 | } | 70 | } |
71 | 71 | ||
72 | /* | 72 | /* |
73 | * for readdir, we encode the directory frag and offset within that | 73 | * for f_pos for readdir: |
74 | * frag into f_pos. | 74 | * - hash order: |
75 | * (0xff << 52) | ((24 bits hash) << 28) | | ||
76 | * (the nth entry has hash collision); | ||
77 | * - frag+name order; | ||
78 | * ((frag value) << 28) | (the nth entry in frag); | ||
75 | */ | 79 | */ |
80 | #define OFFSET_BITS 28 | ||
81 | #define OFFSET_MASK ((1 << OFFSET_BITS) - 1) | ||
82 | #define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) | ||
83 | loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) | ||
84 | { | ||
85 | loff_t fpos = ((loff_t)high << 28) | (loff_t)off; | ||
86 | if (hash_order) | ||
87 | fpos |= HASH_ORDER; | ||
88 | return fpos; | ||
89 | } | ||
90 | |||
91 | static bool is_hash_order(loff_t p) | ||
92 | { | ||
93 | return (p & HASH_ORDER) == HASH_ORDER; | ||
94 | } | ||
95 | |||
76 | static unsigned fpos_frag(loff_t p) | 96 | static unsigned fpos_frag(loff_t p) |
77 | { | 97 | { |
78 | return p >> 32; | 98 | return p >> OFFSET_BITS; |
79 | } | 99 | } |
100 | |||
101 | static unsigned fpos_hash(loff_t p) | ||
102 | { | ||
103 | return ceph_frag_value(fpos_frag(p)); | ||
104 | } | ||
105 | |||
80 | static unsigned fpos_off(loff_t p) | 106 | static unsigned fpos_off(loff_t p) |
81 | { | 107 | { |
82 | return p & 0xffffffff; | 108 | return p & OFFSET_MASK; |
83 | } | 109 | } |
84 | 110 | ||
85 | static int fpos_cmp(loff_t l, loff_t r) | 111 | static int fpos_cmp(loff_t l, loff_t r) |
@@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, | |||
111 | return 0; | 137 | return 0; |
112 | } | 138 | } |
113 | 139 | ||
140 | |||
141 | static struct dentry * | ||
142 | __dcache_find_get_entry(struct dentry *parent, u64 idx, | ||
143 | struct ceph_readdir_cache_control *cache_ctl) | ||
144 | { | ||
145 | struct inode *dir = d_inode(parent); | ||
146 | struct dentry *dentry; | ||
147 | unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; | ||
148 | loff_t ptr_pos = idx * sizeof(struct dentry *); | ||
149 | pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; | ||
150 | |||
151 | if (ptr_pos >= i_size_read(dir)) | ||
152 | return NULL; | ||
153 | |||
154 | if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { | ||
155 | ceph_readdir_cache_release(cache_ctl); | ||
156 | cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); | ||
157 | if (!cache_ctl->page) { | ||
158 | dout(" page %lu not found\n", ptr_pgoff); | ||
159 | return ERR_PTR(-EAGAIN); | ||
160 | } | ||
161 | /* reading/filling the cache are serialized by | ||
162 | i_mutex, no need to use page lock */ | ||
163 | unlock_page(cache_ctl->page); | ||
164 | cache_ctl->dentries = kmap(cache_ctl->page); | ||
165 | } | ||
166 | |||
167 | cache_ctl->index = idx & idx_mask; | ||
168 | |||
169 | rcu_read_lock(); | ||
170 | spin_lock(&parent->d_lock); | ||
171 | /* check i_size again here, because empty directory can be | ||
172 | * marked as complete while not holding the i_mutex. */ | ||
173 | if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) | ||
174 | dentry = cache_ctl->dentries[cache_ctl->index]; | ||
175 | else | ||
176 | dentry = NULL; | ||
177 | spin_unlock(&parent->d_lock); | ||
178 | if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) | ||
179 | dentry = NULL; | ||
180 | rcu_read_unlock(); | ||
181 | return dentry ? : ERR_PTR(-EAGAIN); | ||
182 | } | ||
183 | |||
114 | /* | 184 | /* |
115 | * When possible, we try to satisfy a readdir by peeking at the | 185 | * When possible, we try to satisfy a readdir by peeking at the |
116 | * dcache. We make this work by carefully ordering dentries on | 186 | * dcache. We make this work by carefully ordering dentries on |
@@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, | |||
130 | struct inode *dir = d_inode(parent); | 200 | struct inode *dir = d_inode(parent); |
131 | struct dentry *dentry, *last = NULL; | 201 | struct dentry *dentry, *last = NULL; |
132 | struct ceph_dentry_info *di; | 202 | struct ceph_dentry_info *di; |
133 | unsigned nsize = PAGE_SIZE / sizeof(struct dentry *); | ||
134 | int err = 0; | ||
135 | loff_t ptr_pos = 0; | ||
136 | struct ceph_readdir_cache_control cache_ctl = {}; | 203 | struct ceph_readdir_cache_control cache_ctl = {}; |
204 | u64 idx = 0; | ||
205 | int err = 0; | ||
137 | 206 | ||
138 | dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); | 207 | dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); |
208 | |||
209 | /* search start position */ | ||
210 | if (ctx->pos > 2) { | ||
211 | u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); | ||
212 | while (count > 0) { | ||
213 | u64 step = count >> 1; | ||
214 | dentry = __dcache_find_get_entry(parent, idx + step, | ||
215 | &cache_ctl); | ||
216 | if (!dentry) { | ||
217 | /* use linear search */ | ||
218 | idx = 0; | ||
219 | break; | ||
220 | } | ||
221 | if (IS_ERR(dentry)) { | ||
222 | err = PTR_ERR(dentry); | ||
223 | goto out; | ||
224 | } | ||
225 | di = ceph_dentry(dentry); | ||
226 | spin_lock(&dentry->d_lock); | ||
227 | if (fpos_cmp(di->offset, ctx->pos) < 0) { | ||
228 | idx += step + 1; | ||
229 | count -= step + 1; | ||
230 | } else { | ||
231 | count = step; | ||
232 | } | ||
233 | spin_unlock(&dentry->d_lock); | ||
234 | dput(dentry); | ||
235 | } | ||
139 | 236 | ||
140 | /* we can calculate cache index for the first dirfrag */ | 237 | dout("__dcache_readdir %p cache idx %llu\n", dir, idx); |
141 | if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { | ||
142 | cache_ctl.index = fpos_off(ctx->pos) - 2; | ||
143 | BUG_ON(cache_ctl.index < 0); | ||
144 | ptr_pos = cache_ctl.index * sizeof(struct dentry *); | ||
145 | } | 238 | } |
146 | 239 | ||
147 | while (true) { | ||
148 | pgoff_t pgoff; | ||
149 | bool emit_dentry; | ||
150 | 240 | ||
151 | if (ptr_pos >= i_size_read(dir)) { | 241 | for (;;) { |
242 | bool emit_dentry = false; | ||
243 | dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); | ||
244 | if (!dentry) { | ||
152 | fi->flags |= CEPH_F_ATEND; | 245 | fi->flags |= CEPH_F_ATEND; |
153 | err = 0; | 246 | err = 0; |
154 | break; | 247 | break; |
155 | } | 248 | } |
156 | 249 | if (IS_ERR(dentry)) { | |
157 | err = -EAGAIN; | 250 | err = PTR_ERR(dentry); |
158 | pgoff = ptr_pos >> PAGE_SHIFT; | 251 | goto out; |
159 | if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { | ||
160 | ceph_readdir_cache_release(&cache_ctl); | ||
161 | cache_ctl.page = find_lock_page(&dir->i_data, pgoff); | ||
162 | if (!cache_ctl.page) { | ||
163 | dout(" page %lu not found\n", pgoff); | ||
164 | break; | ||
165 | } | ||
166 | /* reading/filling the cache are serialized by | ||
167 | * i_mutex, no need to use page lock */ | ||
168 | unlock_page(cache_ctl.page); | ||
169 | cache_ctl.dentries = kmap(cache_ctl.page); | ||
170 | } | 252 | } |
171 | 253 | ||
172 | rcu_read_lock(); | ||
173 | spin_lock(&parent->d_lock); | ||
174 | /* check i_size again here, because empty directory can be | ||
175 | * marked as complete while not holding the i_mutex. */ | ||
176 | if (ceph_dir_is_complete_ordered(dir) && | ||
177 | ptr_pos < i_size_read(dir)) | ||
178 | dentry = cache_ctl.dentries[cache_ctl.index % nsize]; | ||
179 | else | ||
180 | dentry = NULL; | ||
181 | spin_unlock(&parent->d_lock); | ||
182 | if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) | ||
183 | dentry = NULL; | ||
184 | rcu_read_unlock(); | ||
185 | if (!dentry) | ||
186 | break; | ||
187 | |||
188 | emit_dentry = false; | ||
189 | di = ceph_dentry(dentry); | 254 | di = ceph_dentry(dentry); |
190 | spin_lock(&dentry->d_lock); | 255 | spin_lock(&dentry->d_lock); |
191 | if (di->lease_shared_gen == shared_gen && | 256 | if (di->lease_shared_gen == shared_gen && |
192 | d_really_is_positive(dentry) && | 257 | d_really_is_positive(dentry) && |
193 | ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && | ||
194 | ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && | ||
195 | fpos_cmp(ctx->pos, di->offset) <= 0) { | 258 | fpos_cmp(ctx->pos, di->offset) <= 0) { |
196 | emit_dentry = true; | 259 | emit_dentry = true; |
197 | } | 260 | } |
198 | spin_unlock(&dentry->d_lock); | 261 | spin_unlock(&dentry->d_lock); |
199 | 262 | ||
200 | if (emit_dentry) { | 263 | if (emit_dentry) { |
201 | dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, | 264 | dout(" %llx dentry %p %pd %p\n", di->offset, |
202 | dentry, dentry, d_inode(dentry)); | 265 | dentry, dentry, d_inode(dentry)); |
203 | ctx->pos = di->offset; | 266 | ctx->pos = di->offset; |
204 | if (!dir_emit(ctx, dentry->d_name.name, | 267 | if (!dir_emit(ctx, dentry->d_name.name, |
@@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, | |||
218 | } else { | 281 | } else { |
219 | dput(dentry); | 282 | dput(dentry); |
220 | } | 283 | } |
221 | |||
222 | cache_ctl.index++; | ||
223 | ptr_pos += sizeof(struct dentry *); | ||
224 | } | 284 | } |
285 | out: | ||
225 | ceph_readdir_cache_release(&cache_ctl); | 286 | ceph_readdir_cache_release(&cache_ctl); |
226 | if (last) { | 287 | if (last) { |
227 | int ret; | 288 | int ret; |
@@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, | |||
235 | return err; | 296 | return err; |
236 | } | 297 | } |
237 | 298 | ||
299 | static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) | ||
300 | { | ||
301 | if (!fi->last_readdir) | ||
302 | return true; | ||
303 | if (is_hash_order(pos)) | ||
304 | return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); | ||
305 | else | ||
306 | return fi->frag != fpos_frag(pos); | ||
307 | } | ||
308 | |||
238 | static int ceph_readdir(struct file *file, struct dir_context *ctx) | 309 | static int ceph_readdir(struct file *file, struct dir_context *ctx) |
239 | { | 310 | { |
240 | struct ceph_file_info *fi = file->private_data; | 311 | struct ceph_file_info *fi = file->private_data; |
@@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
242 | struct ceph_inode_info *ci = ceph_inode(inode); | 313 | struct ceph_inode_info *ci = ceph_inode(inode); |
243 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 314 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
244 | struct ceph_mds_client *mdsc = fsc->mdsc; | 315 | struct ceph_mds_client *mdsc = fsc->mdsc; |
245 | unsigned frag = fpos_frag(ctx->pos); | 316 | int i; |
246 | int off = fpos_off(ctx->pos); | ||
247 | int err; | 317 | int err; |
248 | u32 ftype; | 318 | u32 ftype; |
249 | struct ceph_mds_reply_info_parsed *rinfo; | 319 | struct ceph_mds_reply_info_parsed *rinfo; |
250 | 320 | ||
251 | dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); | 321 | dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); |
252 | if (fi->flags & CEPH_F_ATEND) | 322 | if (fi->flags & CEPH_F_ATEND) |
253 | return 0; | 323 | return 0; |
254 | 324 | ||
@@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
260 | inode->i_mode >> 12)) | 330 | inode->i_mode >> 12)) |
261 | return 0; | 331 | return 0; |
262 | ctx->pos = 1; | 332 | ctx->pos = 1; |
263 | off = 1; | ||
264 | } | 333 | } |
265 | if (ctx->pos == 1) { | 334 | if (ctx->pos == 1) { |
266 | ino_t ino = parent_ino(file->f_path.dentry); | 335 | ino_t ino = parent_ino(file->f_path.dentry); |
@@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
270 | inode->i_mode >> 12)) | 339 | inode->i_mode >> 12)) |
271 | return 0; | 340 | return 0; |
272 | ctx->pos = 2; | 341 | ctx->pos = 2; |
273 | off = 2; | ||
274 | } | 342 | } |
275 | 343 | ||
276 | /* can we use the dcache? */ | 344 | /* can we use the dcache? */ |
@@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
285 | err = __dcache_readdir(file, ctx, shared_gen); | 353 | err = __dcache_readdir(file, ctx, shared_gen); |
286 | if (err != -EAGAIN) | 354 | if (err != -EAGAIN) |
287 | return err; | 355 | return err; |
288 | frag = fpos_frag(ctx->pos); | ||
289 | off = fpos_off(ctx->pos); | ||
290 | } else { | 356 | } else { |
291 | spin_unlock(&ci->i_ceph_lock); | 357 | spin_unlock(&ci->i_ceph_lock); |
292 | } | 358 | } |
@@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
294 | /* proceed with a normal readdir */ | 360 | /* proceed with a normal readdir */ |
295 | more: | 361 | more: |
296 | /* do we have the correct frag content buffered? */ | 362 | /* do we have the correct frag content buffered? */ |
297 | if (fi->frag != frag || fi->last_readdir == NULL) { | 363 | if (need_send_readdir(fi, ctx->pos)) { |
298 | struct ceph_mds_request *req; | 364 | struct ceph_mds_request *req; |
365 | unsigned frag; | ||
299 | int op = ceph_snap(inode) == CEPH_SNAPDIR ? | 366 | int op = ceph_snap(inode) == CEPH_SNAPDIR ? |
300 | CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; | 367 | CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; |
301 | 368 | ||
@@ -305,6 +372,13 @@ more: | |||
305 | fi->last_readdir = NULL; | 372 | fi->last_readdir = NULL; |
306 | } | 373 | } |
307 | 374 | ||
375 | if (is_hash_order(ctx->pos)) { | ||
376 | frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), | ||
377 | NULL, NULL); | ||
378 | } else { | ||
379 | frag = fpos_frag(ctx->pos); | ||
380 | } | ||
381 | |||
308 | dout("readdir fetching %llx.%llx frag %x offset '%s'\n", | 382 | dout("readdir fetching %llx.%llx frag %x offset '%s'\n", |
309 | ceph_vinop(inode), frag, fi->last_name); | 383 | ceph_vinop(inode), frag, fi->last_name); |
310 | req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); | 384 | req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); |
@@ -331,6 +405,8 @@ more: | |||
331 | req->r_readdir_cache_idx = fi->readdir_cache_idx; | 405 | req->r_readdir_cache_idx = fi->readdir_cache_idx; |
332 | req->r_readdir_offset = fi->next_offset; | 406 | req->r_readdir_offset = fi->next_offset; |
333 | req->r_args.readdir.frag = cpu_to_le32(frag); | 407 | req->r_args.readdir.frag = cpu_to_le32(frag); |
408 | req->r_args.readdir.flags = | ||
409 | cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); | ||
334 | 410 | ||
335 | req->r_inode = inode; | 411 | req->r_inode = inode; |
336 | ihold(inode); | 412 | ihold(inode); |
@@ -340,22 +416,26 @@ more: | |||
340 | ceph_mdsc_put_request(req); | 416 | ceph_mdsc_put_request(req); |
341 | return err; | 417 | return err; |
342 | } | 418 | } |
343 | dout("readdir got and parsed readdir result=%d" | 419 | dout("readdir got and parsed readdir result=%d on " |
344 | " on frag %x, end=%d, complete=%d\n", err, frag, | 420 | "frag %x, end=%d, complete=%d, hash_order=%d\n", |
421 | err, frag, | ||
345 | (int)req->r_reply_info.dir_end, | 422 | (int)req->r_reply_info.dir_end, |
346 | (int)req->r_reply_info.dir_complete); | 423 | (int)req->r_reply_info.dir_complete, |
347 | 424 | (int)req->r_reply_info.hash_order); | |
348 | 425 | ||
349 | /* note next offset and last dentry name */ | ||
350 | rinfo = &req->r_reply_info; | 426 | rinfo = &req->r_reply_info; |
351 | if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { | 427 | if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { |
352 | frag = le32_to_cpu(rinfo->dir_dir->frag); | 428 | frag = le32_to_cpu(rinfo->dir_dir->frag); |
353 | off = req->r_readdir_offset; | 429 | if (!rinfo->hash_order) { |
354 | fi->next_offset = off; | 430 | fi->next_offset = req->r_readdir_offset; |
431 | /* adjust ctx->pos to beginning of frag */ | ||
432 | ctx->pos = ceph_make_fpos(frag, | ||
433 | fi->next_offset, | ||
434 | false); | ||
435 | } | ||
355 | } | 436 | } |
356 | 437 | ||
357 | fi->frag = frag; | 438 | fi->frag = frag; |
358 | fi->offset = fi->next_offset; | ||
359 | fi->last_readdir = req; | 439 | fi->last_readdir = req; |
360 | 440 | ||
361 | if (req->r_did_prepopulate) { | 441 | if (req->r_did_prepopulate) { |
@@ -363,7 +443,8 @@ more: | |||
363 | if (fi->readdir_cache_idx < 0) { | 443 | if (fi->readdir_cache_idx < 0) { |
364 | /* preclude from marking dir ordered */ | 444 | /* preclude from marking dir ordered */ |
365 | fi->dir_ordered_count = 0; | 445 | fi->dir_ordered_count = 0; |
366 | } else if (ceph_frag_is_leftmost(frag) && off == 2) { | 446 | } else if (ceph_frag_is_leftmost(frag) && |
447 | fi->next_offset == 2) { | ||
367 | /* note dir version at start of readdir so | 448 | /* note dir version at start of readdir so |
368 | * we can tell if any dentries get dropped */ | 449 | * we can tell if any dentries get dropped */ |
369 | fi->dir_release_count = req->r_dir_release_cnt; | 450 | fi->dir_release_count = req->r_dir_release_cnt; |
@@ -377,65 +458,87 @@ more: | |||
377 | fi->dir_release_count = 0; | 458 | fi->dir_release_count = 0; |
378 | } | 459 | } |
379 | 460 | ||
380 | if (req->r_reply_info.dir_end) { | 461 | /* note next offset and last dentry name */ |
381 | kfree(fi->last_name); | 462 | if (rinfo->dir_nr > 0) { |
382 | fi->last_name = NULL; | 463 | struct ceph_mds_reply_dir_entry *rde = |
383 | if (ceph_frag_is_rightmost(frag)) | 464 | rinfo->dir_entries + (rinfo->dir_nr-1); |
384 | fi->next_offset = 2; | 465 | unsigned next_offset = req->r_reply_info.dir_end ? |
385 | else | 466 | 2 : (fpos_off(rde->offset) + 1); |
386 | fi->next_offset = 0; | 467 | err = note_last_dentry(fi, rde->name, rde->name_len, |
387 | } else { | 468 | next_offset); |
388 | err = note_last_dentry(fi, | ||
389 | rinfo->dir_dname[rinfo->dir_nr-1], | ||
390 | rinfo->dir_dname_len[rinfo->dir_nr-1], | ||
391 | fi->next_offset + rinfo->dir_nr); | ||
392 | if (err) | 469 | if (err) |
393 | return err; | 470 | return err; |
471 | } else if (req->r_reply_info.dir_end) { | ||
472 | fi->next_offset = 2; | ||
473 | /* keep last name */ | ||
394 | } | 474 | } |
395 | } | 475 | } |
396 | 476 | ||
397 | rinfo = &fi->last_readdir->r_reply_info; | 477 | rinfo = &fi->last_readdir->r_reply_info; |
398 | dout("readdir frag %x num %d off %d chunkoff %d\n", frag, | 478 | dout("readdir frag %x num %d pos %llx chunk first %llx\n", |
399 | rinfo->dir_nr, off, fi->offset); | 479 | fi->frag, rinfo->dir_nr, ctx->pos, |
400 | 480 | rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); | |
401 | ctx->pos = ceph_make_fpos(frag, off); | 481 | |
402 | while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { | 482 | i = 0; |
403 | struct ceph_mds_reply_inode *in = | 483 | /* search start position */ |
404 | rinfo->dir_in[off - fi->offset].in; | 484 | if (rinfo->dir_nr > 0) { |
485 | int step, nr = rinfo->dir_nr; | ||
486 | while (nr > 0) { | ||
487 | step = nr >> 1; | ||
488 | if (rinfo->dir_entries[i + step].offset < ctx->pos) { | ||
489 | i += step + 1; | ||
490 | nr -= step + 1; | ||
491 | } else { | ||
492 | nr = step; | ||
493 | } | ||
494 | } | ||
495 | } | ||
496 | for (; i < rinfo->dir_nr; i++) { | ||
497 | struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; | ||
405 | struct ceph_vino vino; | 498 | struct ceph_vino vino; |
406 | ino_t ino; | 499 | ino_t ino; |
407 | 500 | ||
408 | dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", | 501 | BUG_ON(rde->offset < ctx->pos); |
409 | off, off - fi->offset, rinfo->dir_nr, ctx->pos, | 502 | |
410 | rinfo->dir_dname_len[off - fi->offset], | 503 | ctx->pos = rde->offset; |
411 | rinfo->dir_dname[off - fi->offset], in); | 504 | dout("readdir (%d/%d) -> %llx '%.*s' %p\n", |
412 | BUG_ON(!in); | 505 | i, rinfo->dir_nr, ctx->pos, |
413 | ftype = le32_to_cpu(in->mode) >> 12; | 506 | rde->name_len, rde->name, &rde->inode.in); |
414 | vino.ino = le64_to_cpu(in->ino); | 507 | |
415 | vino.snap = le64_to_cpu(in->snapid); | 508 | BUG_ON(!rde->inode.in); |
509 | ftype = le32_to_cpu(rde->inode.in->mode) >> 12; | ||
510 | vino.ino = le64_to_cpu(rde->inode.in->ino); | ||
511 | vino.snap = le64_to_cpu(rde->inode.in->snapid); | ||
416 | ino = ceph_vino_to_ino(vino); | 512 | ino = ceph_vino_to_ino(vino); |
417 | if (!dir_emit(ctx, | 513 | |
418 | rinfo->dir_dname[off - fi->offset], | 514 | if (!dir_emit(ctx, rde->name, rde->name_len, |
419 | rinfo->dir_dname_len[off - fi->offset], | 515 | ceph_translate_ino(inode->i_sb, ino), ftype)) { |
420 | ceph_translate_ino(inode->i_sb, ino), ftype)) { | ||
421 | dout("filldir stopping us...\n"); | 516 | dout("filldir stopping us...\n"); |
422 | return 0; | 517 | return 0; |
423 | } | 518 | } |
424 | off++; | ||
425 | ctx->pos++; | 519 | ctx->pos++; |
426 | } | 520 | } |
427 | 521 | ||
428 | if (fi->last_name) { | 522 | if (fi->next_offset > 2) { |
429 | ceph_mdsc_put_request(fi->last_readdir); | 523 | ceph_mdsc_put_request(fi->last_readdir); |
430 | fi->last_readdir = NULL; | 524 | fi->last_readdir = NULL; |
431 | goto more; | 525 | goto more; |
432 | } | 526 | } |
433 | 527 | ||
434 | /* more frags? */ | 528 | /* more frags? */ |
435 | if (!ceph_frag_is_rightmost(frag)) { | 529 | if (!ceph_frag_is_rightmost(fi->frag)) { |
436 | frag = ceph_frag_next(frag); | 530 | unsigned frag = ceph_frag_next(fi->frag); |
437 | off = 0; | 531 | if (is_hash_order(ctx->pos)) { |
438 | ctx->pos = ceph_make_fpos(frag, off); | 532 | loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), |
533 | fi->next_offset, true); | ||
534 | if (new_pos > ctx->pos) | ||
535 | ctx->pos = new_pos; | ||
536 | /* keep last_name */ | ||
537 | } else { | ||
538 | ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); | ||
539 | kfree(fi->last_name); | ||
540 | fi->last_name = NULL; | ||
541 | } | ||
439 | dout("readdir next frag is %x\n", frag); | 542 | dout("readdir next frag is %x\n", frag); |
440 | goto more; | 543 | goto more; |
441 | } | 544 | } |
@@ -467,7 +570,7 @@ more: | |||
467 | return 0; | 570 | return 0; |
468 | } | 571 | } |
469 | 572 | ||
470 | static void reset_readdir(struct ceph_file_info *fi, unsigned frag) | 573 | static void reset_readdir(struct ceph_file_info *fi) |
471 | { | 574 | { |
472 | if (fi->last_readdir) { | 575 | if (fi->last_readdir) { |
473 | ceph_mdsc_put_request(fi->last_readdir); | 576 | ceph_mdsc_put_request(fi->last_readdir); |
@@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) | |||
477 | fi->last_name = NULL; | 580 | fi->last_name = NULL; |
478 | fi->dir_release_count = 0; | 581 | fi->dir_release_count = 0; |
479 | fi->readdir_cache_idx = -1; | 582 | fi->readdir_cache_idx = -1; |
480 | if (ceph_frag_is_leftmost(frag)) | 583 | fi->next_offset = 2; /* compensate for . and .. */ |
481 | fi->next_offset = 2; /* compensate for . and .. */ | ||
482 | else | ||
483 | fi->next_offset = 0; | ||
484 | fi->flags &= ~CEPH_F_ATEND; | 584 | fi->flags &= ~CEPH_F_ATEND; |
485 | } | 585 | } |
486 | 586 | ||
587 | /* | ||
588 | * discard buffered readdir content on seekdir(0), or seek to new frag, | ||
589 | * or seek prior to current chunk | ||
590 | */ | ||
591 | static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) | ||
592 | { | ||
593 | struct ceph_mds_reply_info_parsed *rinfo; | ||
594 | loff_t chunk_offset; | ||
595 | if (new_pos == 0) | ||
596 | return true; | ||
597 | if (is_hash_order(new_pos)) { | ||
598 | /* no need to reset last_name for a forward seek when | ||
599 | * dentries are sorted in hash order */ | ||
600 | } else if (fi->frag != fpos_frag(new_pos)) { | ||
601 | return true; | ||
602 | } | ||
603 | rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; | ||
604 | if (!rinfo || !rinfo->dir_nr) | ||
605 | return true; | ||
606 | chunk_offset = rinfo->dir_entries[0].offset; | ||
607 | return new_pos < chunk_offset || | ||
608 | is_hash_order(new_pos) != is_hash_order(chunk_offset); | ||
609 | } | ||
610 | |||
487 | static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) | 611 | static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) |
488 | { | 612 | { |
489 | struct ceph_file_info *fi = file->private_data; | 613 | struct ceph_file_info *fi = file->private_data; |
490 | struct inode *inode = file->f_mapping->host; | 614 | struct inode *inode = file->f_mapping->host; |
491 | loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); | ||
492 | loff_t retval; | 615 | loff_t retval; |
493 | 616 | ||
494 | inode_lock(inode); | 617 | inode_lock(inode); |
@@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) | |||
505 | } | 628 | } |
506 | 629 | ||
507 | if (offset >= 0) { | 630 | if (offset >= 0) { |
631 | if (need_reset_readdir(fi, offset)) { | ||
632 | dout("dir_llseek dropping %p content\n", file); | ||
633 | reset_readdir(fi); | ||
634 | } else if (is_hash_order(offset) && offset > file->f_pos) { | ||
635 | /* for hash offset, we don't know if a forward seek | ||
636 | * is within same frag */ | ||
637 | fi->dir_release_count = 0; | ||
638 | fi->readdir_cache_idx = -1; | ||
639 | } | ||
640 | |||
508 | if (offset != file->f_pos) { | 641 | if (offset != file->f_pos) { |
509 | file->f_pos = offset; | 642 | file->f_pos = offset; |
510 | file->f_version = 0; | 643 | file->f_version = 0; |
511 | fi->flags &= ~CEPH_F_ATEND; | 644 | fi->flags &= ~CEPH_F_ATEND; |
512 | } | 645 | } |
513 | retval = offset; | 646 | retval = offset; |
514 | |||
515 | if (offset == 0 || | ||
516 | fpos_frag(offset) != fi->frag || | ||
517 | fpos_off(offset) < fi->offset) { | ||
518 | /* discard buffered readdir content on seekdir(0), or | ||
519 | * seek to new frag, or seek prior to current chunk */ | ||
520 | dout("dir_llseek dropping %p content\n", file); | ||
521 | reset_readdir(fi, fpos_frag(offset)); | ||
522 | } else if (fpos_cmp(offset, old_offset) > 0) { | ||
523 | /* reset dir_release_count if we did a forward seek */ | ||
524 | fi->dir_release_count = 0; | ||
525 | fi->readdir_cache_idx = -1; | ||
526 | } | ||
527 | } | 647 | } |
528 | out: | 648 | out: |
529 | inode_unlock(inode); | 649 | inode_unlock(inode); |
@@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, | |||
591 | return dentry; | 711 | return dentry; |
592 | } | 712 | } |
593 | 713 | ||
594 | static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) | 714 | static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) |
595 | { | 715 | { |
596 | return ceph_ino(inode) == CEPH_INO_ROOT && | 716 | return ceph_ino(inode) == CEPH_INO_ROOT && |
597 | strncmp(dentry->d_name.name, ".ceph", 5) == 0; | 717 | strncmp(dentry->d_name.name, ".ceph", 5) == 0; |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4f1dc7120916..a888df6f2d71 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -192,6 +192,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) | |||
192 | } | 192 | } |
193 | 193 | ||
194 | /* | 194 | /* |
195 | * try renew caps after session gets killed. | ||
196 | */ | ||
197 | int ceph_renew_caps(struct inode *inode) | ||
198 | { | ||
199 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | ||
200 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
201 | struct ceph_mds_request *req; | ||
202 | int err, flags, wanted; | ||
203 | |||
204 | spin_lock(&ci->i_ceph_lock); | ||
205 | wanted = __ceph_caps_file_wanted(ci); | ||
206 | if (__ceph_is_any_real_caps(ci) && | ||
207 | (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { | ||
208 | int issued = __ceph_caps_issued(ci, NULL); | ||
209 | spin_unlock(&ci->i_ceph_lock); | ||
210 | dout("renew caps %p want %s issued %s updating mds_wanted\n", | ||
211 | inode, ceph_cap_string(wanted), ceph_cap_string(issued)); | ||
212 | ceph_check_caps(ci, 0, NULL); | ||
213 | return 0; | ||
214 | } | ||
215 | spin_unlock(&ci->i_ceph_lock); | ||
216 | |||
217 | flags = 0; | ||
218 | if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) | ||
219 | flags = O_RDWR; | ||
220 | else if (wanted & CEPH_CAP_FILE_RD) | ||
221 | flags = O_RDONLY; | ||
222 | else if (wanted & CEPH_CAP_FILE_WR) | ||
223 | flags = O_WRONLY; | ||
224 | #ifdef O_LAZY | ||
225 | if (wanted & CEPH_CAP_FILE_LAZYIO) | ||
226 | flags |= O_LAZY; | ||
227 | #endif | ||
228 | |||
229 | req = prepare_open_request(inode->i_sb, flags, 0); | ||
230 | if (IS_ERR(req)) { | ||
231 | err = PTR_ERR(req); | ||
232 | goto out; | ||
233 | } | ||
234 | |||
235 | req->r_inode = inode; | ||
236 | ihold(inode); | ||
237 | req->r_num_caps = 1; | ||
238 | req->r_fmode = -1; | ||
239 | |||
240 | err = ceph_mdsc_do_request(mdsc, NULL, req); | ||
241 | ceph_mdsc_put_request(req); | ||
242 | out: | ||
243 | dout("renew caps %p open result=%d\n", inode, err); | ||
244 | return err < 0 ? err : 0; | ||
245 | } | ||
246 | |||
247 | /* | ||
195 | * If we already have the requisite capabilities, we can satisfy | 248 | * If we already have the requisite capabilities, we can satisfy |
196 | * the open request locally (no need to request new caps from the | 249 | * the open request locally (no need to request new caps from the |
197 | * MDS). We do, however, need to inform the MDS (asynchronously) | 250 | * MDS). We do, however, need to inform the MDS (asynchronously) |
@@ -616,8 +669,7 @@ static void ceph_aio_complete(struct inode *inode, | |||
616 | kfree(aio_req); | 669 | kfree(aio_req); |
617 | } | 670 | } |
618 | 671 | ||
619 | static void ceph_aio_complete_req(struct ceph_osd_request *req, | 672 | static void ceph_aio_complete_req(struct ceph_osd_request *req) |
620 | struct ceph_msg *msg) | ||
621 | { | 673 | { |
622 | int rc = req->r_result; | 674 | int rc = req->r_result; |
623 | struct inode *inode = req->r_inode; | 675 | struct inode *inode = req->r_inode; |
@@ -714,14 +766,21 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
714 | req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | | 766 | req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | |
715 | CEPH_OSD_FLAG_ONDISK | | 767 | CEPH_OSD_FLAG_ONDISK | |
716 | CEPH_OSD_FLAG_WRITE; | 768 | CEPH_OSD_FLAG_WRITE; |
717 | req->r_base_oloc = orig_req->r_base_oloc; | 769 | ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); |
718 | req->r_base_oid = orig_req->r_base_oid; | 770 | ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); |
771 | |||
772 | ret = ceph_osdc_alloc_messages(req, GFP_NOFS); | ||
773 | if (ret) { | ||
774 | ceph_osdc_put_request(req); | ||
775 | req = orig_req; | ||
776 | goto out; | ||
777 | } | ||
719 | 778 | ||
720 | req->r_ops[0] = orig_req->r_ops[0]; | 779 | req->r_ops[0] = orig_req->r_ops[0]; |
721 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); | 780 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); |
722 | 781 | ||
723 | ceph_osdc_build_request(req, req->r_ops[0].extent.offset, | 782 | req->r_mtime = aio_req->mtime; |
724 | snapc, CEPH_NOSNAP, &aio_req->mtime); | 783 | req->r_data_offset = req->r_ops[0].extent.offset; |
725 | 784 | ||
726 | ceph_osdc_put_request(orig_req); | 785 | ceph_osdc_put_request(orig_req); |
727 | 786 | ||
@@ -733,7 +792,7 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
733 | out: | 792 | out: |
734 | if (ret < 0) { | 793 | if (ret < 0) { |
735 | req->r_result = ret; | 794 | req->r_result = ret; |
736 | ceph_aio_complete_req(req, NULL); | 795 | ceph_aio_complete_req(req); |
737 | } | 796 | } |
738 | 797 | ||
739 | ceph_put_snap_context(snapc); | 798 | ceph_put_snap_context(snapc); |
@@ -764,6 +823,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) | |||
764 | list_add_tail(&req->r_unsafe_item, | 823 | list_add_tail(&req->r_unsafe_item, |
765 | &ci->i_unsafe_writes); | 824 | &ci->i_unsafe_writes); |
766 | spin_unlock(&ci->i_unsafe_lock); | 825 | spin_unlock(&ci->i_unsafe_lock); |
826 | |||
827 | complete_all(&req->r_completion); | ||
767 | } else { | 828 | } else { |
768 | spin_lock(&ci->i_unsafe_lock); | 829 | spin_lock(&ci->i_unsafe_lock); |
769 | list_del_init(&req->r_unsafe_item); | 830 | list_del_init(&req->r_unsafe_item); |
@@ -875,14 +936,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, | |||
875 | (pos+len) | (PAGE_SIZE - 1)); | 936 | (pos+len) | (PAGE_SIZE - 1)); |
876 | 937 | ||
877 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); | 938 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); |
939 | req->r_mtime = mtime; | ||
878 | } | 940 | } |
879 | 941 | ||
880 | |||
881 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, | 942 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, |
882 | false, false); | 943 | false, false); |
883 | 944 | ||
884 | ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); | ||
885 | |||
886 | if (aio_req) { | 945 | if (aio_req) { |
887 | aio_req->total_len += len; | 946 | aio_req->total_len += len; |
888 | aio_req->num_reqs++; | 947 | aio_req->num_reqs++; |
@@ -956,7 +1015,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, | |||
956 | req, false); | 1015 | req, false); |
957 | if (ret < 0) { | 1016 | if (ret < 0) { |
958 | req->r_result = ret; | 1017 | req->r_result = ret; |
959 | ceph_aio_complete_req(req, NULL); | 1018 | ceph_aio_complete_req(req); |
960 | } | 1019 | } |
961 | } | 1020 | } |
962 | return -EIOCBQUEUED; | 1021 | return -EIOCBQUEUED; |
@@ -1067,9 +1126,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, | |||
1067 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, | 1126 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, |
1068 | false, true); | 1127 | false, true); |
1069 | 1128 | ||
1070 | /* BUG_ON(vino.snap != CEPH_NOSNAP); */ | 1129 | req->r_mtime = mtime; |
1071 | ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); | ||
1072 | |||
1073 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); | 1130 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
1074 | if (!ret) | 1131 | if (!ret) |
1075 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); | 1132 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
@@ -1524,9 +1581,7 @@ static int ceph_zero_partial_object(struct inode *inode, | |||
1524 | goto out; | 1581 | goto out; |
1525 | } | 1582 | } |
1526 | 1583 | ||
1527 | ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, | 1584 | req->r_mtime = inode->i_mtime; |
1528 | &inode->i_mtime); | ||
1529 | |||
1530 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); | 1585 | ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); |
1531 | if (!ret) { | 1586 | if (!ret) { |
1532 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); | 1587 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e669cfa9d793..f059b5997072 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/xattr.h> | 11 | #include <linux/xattr.h> |
12 | #include <linux/posix_acl.h> | 12 | #include <linux/posix_acl.h> |
13 | #include <linux/random.h> | 13 | #include <linux/random.h> |
14 | #include <linux/sort.h> | ||
14 | 15 | ||
15 | #include "super.h" | 16 | #include "super.h" |
16 | #include "mds_client.h" | 17 | #include "mds_client.h" |
@@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode, | |||
254 | diri_auth = ci->i_auth_cap->mds; | 255 | diri_auth = ci->i_auth_cap->mds; |
255 | spin_unlock(&ci->i_ceph_lock); | 256 | spin_unlock(&ci->i_ceph_lock); |
256 | 257 | ||
258 | if (mds == -1) /* CDIR_AUTH_PARENT */ | ||
259 | mds = diri_auth; | ||
260 | |||
257 | mutex_lock(&ci->i_fragtree_mutex); | 261 | mutex_lock(&ci->i_fragtree_mutex); |
258 | if (ndist == 0 && mds == diri_auth) { | 262 | if (ndist == 0 && mds == diri_auth) { |
259 | /* no delegation info needed. */ | 263 | /* no delegation info needed. */ |
@@ -300,20 +304,38 @@ out: | |||
300 | return err; | 304 | return err; |
301 | } | 305 | } |
302 | 306 | ||
307 | static int frag_tree_split_cmp(const void *l, const void *r) | ||
308 | { | ||
309 | struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; | ||
310 | struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; | ||
311 | return ceph_frag_compare(ls->frag, rs->frag); | ||
312 | } | ||
313 | |||
314 | static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) | ||
315 | { | ||
316 | if (!frag) | ||
317 | return f == ceph_frag_make(0, 0); | ||
318 | if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) | ||
319 | return false; | ||
320 | return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); | ||
321 | } | ||
322 | |||
303 | static int ceph_fill_fragtree(struct inode *inode, | 323 | static int ceph_fill_fragtree(struct inode *inode, |
304 | struct ceph_frag_tree_head *fragtree, | 324 | struct ceph_frag_tree_head *fragtree, |
305 | struct ceph_mds_reply_dirfrag *dirinfo) | 325 | struct ceph_mds_reply_dirfrag *dirinfo) |
306 | { | 326 | { |
307 | struct ceph_inode_info *ci = ceph_inode(inode); | 327 | struct ceph_inode_info *ci = ceph_inode(inode); |
308 | struct ceph_inode_frag *frag; | 328 | struct ceph_inode_frag *frag, *prev_frag = NULL; |
309 | struct rb_node *rb_node; | 329 | struct rb_node *rb_node; |
310 | int i; | 330 | unsigned i, split_by, nsplits; |
311 | u32 id, nsplits; | 331 | u32 id; |
312 | bool update = false; | 332 | bool update = false; |
313 | 333 | ||
314 | mutex_lock(&ci->i_fragtree_mutex); | 334 | mutex_lock(&ci->i_fragtree_mutex); |
315 | nsplits = le32_to_cpu(fragtree->nsplits); | 335 | nsplits = le32_to_cpu(fragtree->nsplits); |
316 | if (nsplits) { | 336 | if (nsplits != ci->i_fragtree_nsplits) { |
337 | update = true; | ||
338 | } else if (nsplits) { | ||
317 | i = prandom_u32() % nsplits; | 339 | i = prandom_u32() % nsplits; |
318 | id = le32_to_cpu(fragtree->splits[i].frag); | 340 | id = le32_to_cpu(fragtree->splits[i].frag); |
319 | if (!__ceph_find_frag(ci, id)) | 341 | if (!__ceph_find_frag(ci, id)) |
@@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode, | |||
332 | if (!update) | 354 | if (!update) |
333 | goto out_unlock; | 355 | goto out_unlock; |
334 | 356 | ||
357 | if (nsplits > 1) { | ||
358 | sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), | ||
359 | frag_tree_split_cmp, NULL); | ||
360 | } | ||
361 | |||
335 | dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); | 362 | dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); |
336 | rb_node = rb_first(&ci->i_fragtree); | 363 | rb_node = rb_first(&ci->i_fragtree); |
337 | for (i = 0; i < nsplits; i++) { | 364 | for (i = 0; i < nsplits; i++) { |
338 | id = le32_to_cpu(fragtree->splits[i].frag); | 365 | id = le32_to_cpu(fragtree->splits[i].frag); |
366 | split_by = le32_to_cpu(fragtree->splits[i].by); | ||
367 | if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { | ||
368 | pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " | ||
369 | "frag %x split by %d\n", ceph_vinop(inode), | ||
370 | i, nsplits, id, split_by); | ||
371 | continue; | ||
372 | } | ||
339 | frag = NULL; | 373 | frag = NULL; |
340 | while (rb_node) { | 374 | while (rb_node) { |
341 | frag = rb_entry(rb_node, struct ceph_inode_frag, node); | 375 | frag = rb_entry(rb_node, struct ceph_inode_frag, node); |
@@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode, | |||
347 | break; | 381 | break; |
348 | } | 382 | } |
349 | rb_node = rb_next(rb_node); | 383 | rb_node = rb_next(rb_node); |
350 | rb_erase(&frag->node, &ci->i_fragtree); | 384 | /* delete stale split/leaf node */ |
351 | kfree(frag); | 385 | if (frag->split_by > 0 || |
386 | !is_frag_child(frag->frag, prev_frag)) { | ||
387 | rb_erase(&frag->node, &ci->i_fragtree); | ||
388 | if (frag->split_by > 0) | ||
389 | ci->i_fragtree_nsplits--; | ||
390 | kfree(frag); | ||
391 | } | ||
352 | frag = NULL; | 392 | frag = NULL; |
353 | } | 393 | } |
354 | if (!frag) { | 394 | if (!frag) { |
@@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode, | |||
356 | if (IS_ERR(frag)) | 396 | if (IS_ERR(frag)) |
357 | continue; | 397 | continue; |
358 | } | 398 | } |
359 | frag->split_by = le32_to_cpu(fragtree->splits[i].by); | 399 | if (frag->split_by == 0) |
400 | ci->i_fragtree_nsplits++; | ||
401 | frag->split_by = split_by; | ||
360 | dout(" frag %x split by %d\n", frag->frag, frag->split_by); | 402 | dout(" frag %x split by %d\n", frag->frag, frag->split_by); |
403 | prev_frag = frag; | ||
361 | } | 404 | } |
362 | while (rb_node) { | 405 | while (rb_node) { |
363 | frag = rb_entry(rb_node, struct ceph_inode_frag, node); | 406 | frag = rb_entry(rb_node, struct ceph_inode_frag, node); |
364 | rb_node = rb_next(rb_node); | 407 | rb_node = rb_next(rb_node); |
365 | rb_erase(&frag->node, &ci->i_fragtree); | 408 | /* delete stale split/leaf node */ |
366 | kfree(frag); | 409 | if (frag->split_by > 0 || |
410 | !is_frag_child(frag->frag, prev_frag)) { | ||
411 | rb_erase(&frag->node, &ci->i_fragtree); | ||
412 | if (frag->split_by > 0) | ||
413 | ci->i_fragtree_nsplits--; | ||
414 | kfree(frag); | ||
415 | } | ||
367 | } | 416 | } |
368 | out_unlock: | 417 | out_unlock: |
369 | mutex_unlock(&ci->i_fragtree_mutex); | 418 | mutex_unlock(&ci->i_fragtree_mutex); |
@@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode) | |||
513 | rb_erase(n, &ci->i_fragtree); | 562 | rb_erase(n, &ci->i_fragtree); |
514 | kfree(frag); | 563 | kfree(frag); |
515 | } | 564 | } |
565 | ci->i_fragtree_nsplits = 0; | ||
516 | 566 | ||
517 | __ceph_destroy_xattrs(ci); | 567 | __ceph_destroy_xattrs(ci); |
518 | if (ci->i_xattrs.blob) | 568 | if (ci->i_xattrs.blob) |
@@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode) | |||
533 | return 1; | 583 | return 1; |
534 | } | 584 | } |
535 | 585 | ||
586 | static inline blkcnt_t calc_inode_blocks(u64 size) | ||
587 | { | ||
588 | return (size + (1<<9) - 1) >> 9; | ||
589 | } | ||
590 | |||
536 | /* | 591 | /* |
537 | * Helpers to fill in size, ctime, mtime, and atime. We have to be | 592 | * Helpers to fill in size, ctime, mtime, and atime. We have to be |
538 | * careful because either the client or MDS may have more up to date | 593 | * careful because either the client or MDS may have more up to date |
@@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, | |||
555 | size = 0; | 610 | size = 0; |
556 | } | 611 | } |
557 | i_size_write(inode, size); | 612 | i_size_write(inode, size); |
558 | inode->i_blocks = (size + (1<<9) - 1) >> 9; | 613 | inode->i_blocks = calc_inode_blocks(size); |
559 | ci->i_reported_size = size; | 614 | ci->i_reported_size = size; |
560 | if (truncate_seq != ci->i_truncate_seq) { | 615 | if (truncate_seq != ci->i_truncate_seq) { |
561 | dout("truncate_seq %u -> %u\n", | 616 | dout("truncate_seq %u -> %u\n", |
@@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, | |||
814 | 869 | ||
815 | spin_unlock(&ci->i_ceph_lock); | 870 | spin_unlock(&ci->i_ceph_lock); |
816 | 871 | ||
817 | err = -EINVAL; | 872 | if (symlen != i_size_read(inode)) { |
818 | if (WARN_ON(symlen != i_size_read(inode))) | 873 | pr_err("fill_inode %llx.%llx BAD symlink " |
819 | goto out; | 874 | "size %lld\n", ceph_vinop(inode), |
875 | i_size_read(inode)); | ||
876 | i_size_write(inode, symlen); | ||
877 | inode->i_blocks = calc_inode_blocks(symlen); | ||
878 | } | ||
820 | 879 | ||
821 | err = -ENOMEM; | 880 | err = -ENOMEM; |
822 | sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); | 881 | sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); |
@@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, | |||
1309 | int i, err = 0; | 1368 | int i, err = 0; |
1310 | 1369 | ||
1311 | for (i = 0; i < rinfo->dir_nr; i++) { | 1370 | for (i = 0; i < rinfo->dir_nr; i++) { |
1371 | struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; | ||
1312 | struct ceph_vino vino; | 1372 | struct ceph_vino vino; |
1313 | struct inode *in; | 1373 | struct inode *in; |
1314 | int rc; | 1374 | int rc; |
1315 | 1375 | ||
1316 | vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); | 1376 | vino.ino = le64_to_cpu(rde->inode.in->ino); |
1317 | vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); | 1377 | vino.snap = le64_to_cpu(rde->inode.in->snapid); |
1318 | 1378 | ||
1319 | in = ceph_get_inode(req->r_dentry->d_sb, vino); | 1379 | in = ceph_get_inode(req->r_dentry->d_sb, vino); |
1320 | if (IS_ERR(in)) { | 1380 | if (IS_ERR(in)) { |
@@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, | |||
1322 | dout("new_inode badness got %d\n", err); | 1382 | dout("new_inode badness got %d\n", err); |
1323 | continue; | 1383 | continue; |
1324 | } | 1384 | } |
1325 | rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, | 1385 | rc = fill_inode(in, NULL, &rde->inode, NULL, session, |
1326 | req->r_request_started, -1, | 1386 | req->r_request_started, -1, |
1327 | &req->r_caps_reservation); | 1387 | &req->r_caps_reservation); |
1328 | if (rc < 0) { | 1388 | if (rc < 0) { |
1329 | pr_err("fill_inode badness on %p got %d\n", in, rc); | 1389 | pr_err("fill_inode badness on %p got %d\n", in, rc); |
1330 | err = rc; | 1390 | err = rc; |
1331 | continue; | ||
1332 | } | 1391 | } |
1392 | iput(in); | ||
1333 | } | 1393 | } |
1334 | 1394 | ||
1335 | return err; | 1395 | return err; |
@@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
1387 | struct ceph_mds_session *session) | 1447 | struct ceph_mds_session *session) |
1388 | { | 1448 | { |
1389 | struct dentry *parent = req->r_dentry; | 1449 | struct dentry *parent = req->r_dentry; |
1450 | struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); | ||
1390 | struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; | 1451 | struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; |
1391 | struct qstr dname; | 1452 | struct qstr dname; |
1392 | struct dentry *dn; | 1453 | struct dentry *dn; |
@@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
1394 | int err = 0, skipped = 0, ret, i; | 1455 | int err = 0, skipped = 0, ret, i; |
1395 | struct inode *snapdir = NULL; | 1456 | struct inode *snapdir = NULL; |
1396 | struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; | 1457 | struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; |
1397 | struct ceph_dentry_info *di; | ||
1398 | u32 frag = le32_to_cpu(rhead->args.readdir.frag); | 1458 | u32 frag = le32_to_cpu(rhead->args.readdir.frag); |
1459 | u32 last_hash = 0; | ||
1460 | u32 fpos_offset; | ||
1399 | struct ceph_readdir_cache_control cache_ctl = {}; | 1461 | struct ceph_readdir_cache_control cache_ctl = {}; |
1400 | 1462 | ||
1401 | if (req->r_aborted) | 1463 | if (req->r_aborted) |
1402 | return readdir_prepopulate_inodes_only(req, session); | 1464 | return readdir_prepopulate_inodes_only(req, session); |
1403 | 1465 | ||
1466 | if (rinfo->hash_order && req->r_path2) { | ||
1467 | last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, | ||
1468 | req->r_path2, strlen(req->r_path2)); | ||
1469 | last_hash = ceph_frag_value(last_hash); | ||
1470 | } | ||
1471 | |||
1404 | if (rinfo->dir_dir && | 1472 | if (rinfo->dir_dir && |
1405 | le32_to_cpu(rinfo->dir_dir->frag) != frag) { | 1473 | le32_to_cpu(rinfo->dir_dir->frag) != frag) { |
1406 | dout("readdir_prepopulate got new frag %x -> %x\n", | 1474 | dout("readdir_prepopulate got new frag %x -> %x\n", |
1407 | frag, le32_to_cpu(rinfo->dir_dir->frag)); | 1475 | frag, le32_to_cpu(rinfo->dir_dir->frag)); |
1408 | frag = le32_to_cpu(rinfo->dir_dir->frag); | 1476 | frag = le32_to_cpu(rinfo->dir_dir->frag); |
1409 | if (ceph_frag_is_leftmost(frag)) | 1477 | if (!rinfo->hash_order) |
1410 | req->r_readdir_offset = 2; | 1478 | req->r_readdir_offset = 2; |
1411 | else | ||
1412 | req->r_readdir_offset = 0; | ||
1413 | } | 1479 | } |
1414 | 1480 | ||
1415 | if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { | 1481 | if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { |
@@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
1427 | if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { | 1493 | if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { |
1428 | /* note dir version at start of readdir so we can tell | 1494 | /* note dir version at start of readdir so we can tell |
1429 | * if any dentries get dropped */ | 1495 | * if any dentries get dropped */ |
1430 | struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); | ||
1431 | req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); | 1496 | req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); |
1432 | req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); | 1497 | req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); |
1433 | req->r_readdir_cache_idx = 0; | 1498 | req->r_readdir_cache_idx = 0; |
1434 | } | 1499 | } |
1435 | 1500 | ||
1436 | cache_ctl.index = req->r_readdir_cache_idx; | 1501 | cache_ctl.index = req->r_readdir_cache_idx; |
1502 | fpos_offset = req->r_readdir_offset; | ||
1437 | 1503 | ||
1438 | /* FIXME: release caps/leases if error occurs */ | 1504 | /* FIXME: release caps/leases if error occurs */ |
1439 | for (i = 0; i < rinfo->dir_nr; i++) { | 1505 | for (i = 0; i < rinfo->dir_nr; i++) { |
1506 | struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; | ||
1440 | struct ceph_vino vino; | 1507 | struct ceph_vino vino; |
1441 | 1508 | ||
1442 | dname.name = rinfo->dir_dname[i]; | 1509 | dname.name = rde->name; |
1443 | dname.len = rinfo->dir_dname_len[i]; | 1510 | dname.len = rde->name_len; |
1444 | dname.hash = full_name_hash(dname.name, dname.len); | 1511 | dname.hash = full_name_hash(dname.name, dname.len); |
1445 | 1512 | ||
1446 | vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); | 1513 | vino.ino = le64_to_cpu(rde->inode.in->ino); |
1447 | vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); | 1514 | vino.snap = le64_to_cpu(rde->inode.in->snapid); |
1515 | |||
1516 | if (rinfo->hash_order) { | ||
1517 | u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, | ||
1518 | rde->name, rde->name_len); | ||
1519 | hash = ceph_frag_value(hash); | ||
1520 | if (hash != last_hash) | ||
1521 | fpos_offset = 2; | ||
1522 | last_hash = hash; | ||
1523 | rde->offset = ceph_make_fpos(hash, fpos_offset++, true); | ||
1524 | } else { | ||
1525 | rde->offset = ceph_make_fpos(frag, fpos_offset++, false); | ||
1526 | } | ||
1448 | 1527 | ||
1449 | retry_lookup: | 1528 | retry_lookup: |
1450 | dn = d_lookup(parent, &dname); | 1529 | dn = d_lookup(parent, &dname); |
@@ -1490,7 +1569,7 @@ retry_lookup: | |||
1490 | } | 1569 | } |
1491 | } | 1570 | } |
1492 | 1571 | ||
1493 | ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, | 1572 | ret = fill_inode(in, NULL, &rde->inode, NULL, session, |
1494 | req->r_request_started, -1, | 1573 | req->r_request_started, -1, |
1495 | &req->r_caps_reservation); | 1574 | &req->r_caps_reservation); |
1496 | if (ret < 0) { | 1575 | if (ret < 0) { |
@@ -1523,11 +1602,9 @@ retry_lookup: | |||
1523 | dn = realdn; | 1602 | dn = realdn; |
1524 | } | 1603 | } |
1525 | 1604 | ||
1526 | di = dn->d_fsdata; | 1605 | ceph_dentry(dn)->offset = rde->offset; |
1527 | di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); | ||
1528 | 1606 | ||
1529 | update_dentry_lease(dn, rinfo->dir_dlease[i], | 1607 | update_dentry_lease(dn, rde->lease, req->r_session, |
1530 | req->r_session, | ||
1531 | req->r_request_started); | 1608 | req->r_request_started); |
1532 | 1609 | ||
1533 | if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { | 1610 | if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { |
@@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) | |||
1562 | spin_lock(&ci->i_ceph_lock); | 1639 | spin_lock(&ci->i_ceph_lock); |
1563 | dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); | 1640 | dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); |
1564 | i_size_write(inode, size); | 1641 | i_size_write(inode, size); |
1565 | inode->i_blocks = (size + (1 << 9) - 1) >> 9; | 1642 | inode->i_blocks = calc_inode_blocks(size); |
1566 | 1643 | ||
1567 | /* tell the MDS if we are approaching max_size */ | 1644 | /* tell the MDS if we are approaching max_size */ |
1568 | if ((size << 1) >= ci->i_max_size && | 1645 | if ((size << 1) >= ci->i_max_size && |
@@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
1624 | struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, | 1701 | struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, |
1625 | i_pg_inv_work); | 1702 | i_pg_inv_work); |
1626 | struct inode *inode = &ci->vfs_inode; | 1703 | struct inode *inode = &ci->vfs_inode; |
1704 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
1627 | u32 orig_gen; | 1705 | u32 orig_gen; |
1628 | int check = 0; | 1706 | int check = 0; |
1629 | 1707 | ||
1630 | mutex_lock(&ci->i_truncate_mutex); | 1708 | mutex_lock(&ci->i_truncate_mutex); |
1709 | |||
1710 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | ||
1711 | pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", | ||
1712 | inode, ceph_ino(inode)); | ||
1713 | mapping_set_error(inode->i_mapping, -EIO); | ||
1714 | truncate_pagecache(inode, 0); | ||
1715 | mutex_unlock(&ci->i_truncate_mutex); | ||
1716 | goto out; | ||
1717 | } | ||
1718 | |||
1631 | spin_lock(&ci->i_ceph_lock); | 1719 | spin_lock(&ci->i_ceph_lock); |
1632 | dout("invalidate_pages %p gen %d revoking %d\n", inode, | 1720 | dout("invalidate_pages %p gen %d revoking %d\n", inode, |
1633 | ci->i_rdcache_gen, ci->i_rdcache_revoking); | 1721 | ci->i_rdcache_gen, ci->i_rdcache_revoking); |
@@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
1641 | orig_gen = ci->i_rdcache_gen; | 1729 | orig_gen = ci->i_rdcache_gen; |
1642 | spin_unlock(&ci->i_ceph_lock); | 1730 | spin_unlock(&ci->i_ceph_lock); |
1643 | 1731 | ||
1644 | truncate_pagecache(inode, 0); | 1732 | if (invalidate_inode_pages2(inode->i_mapping) < 0) { |
1733 | pr_err("invalidate_pages %p fails\n", inode); | ||
1734 | } | ||
1645 | 1735 | ||
1646 | spin_lock(&ci->i_ceph_lock); | 1736 | spin_lock(&ci->i_ceph_lock); |
1647 | if (orig_gen == ci->i_rdcache_gen && | 1737 | if (orig_gen == ci->i_rdcache_gen && |
@@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) | |||
1920 | if ((issued & CEPH_CAP_FILE_EXCL) && | 2010 | if ((issued & CEPH_CAP_FILE_EXCL) && |
1921 | attr->ia_size > inode->i_size) { | 2011 | attr->ia_size > inode->i_size) { |
1922 | i_size_write(inode, attr->ia_size); | 2012 | i_size_write(inode, attr->ia_size); |
1923 | inode->i_blocks = | 2013 | inode->i_blocks = calc_inode_blocks(attr->ia_size); |
1924 | (attr->ia_size + (1 << 9) - 1) >> 9; | ||
1925 | inode->i_ctime = attr->ia_ctime; | 2014 | inode->i_ctime = attr->ia_ctime; |
1926 | ci->i_reported_size = attr->ia_size; | 2015 | ci->i_reported_size = attr->ia_size; |
1927 | dirtied |= CEPH_CAP_FILE_EXCL; | 2016 | dirtied |= CEPH_CAP_FILE_EXCL; |
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..be6b1657b1af 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
@@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
193 | if (copy_from_user(&dl, arg, sizeof(dl))) | 193 | if (copy_from_user(&dl, arg, sizeof(dl))) |
194 | return -EFAULT; | 194 | return -EFAULT; |
195 | 195 | ||
196 | down_read(&osdc->map_sem); | 196 | down_read(&osdc->lock); |
197 | r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, | 197 | r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, |
198 | &dl.object_no, &dl.object_offset, | 198 | &dl.object_no, &dl.object_offset, |
199 | &olen); | 199 | &olen); |
200 | if (r < 0) { | 200 | if (r < 0) { |
201 | up_read(&osdc->map_sem); | 201 | up_read(&osdc->lock); |
202 | return -EIO; | 202 | return -EIO; |
203 | } | 203 | } |
204 | dl.file_offset -= dl.object_offset; | 204 | dl.file_offset -= dl.object_offset; |
@@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
213 | ceph_ino(inode), dl.object_no); | 213 | ceph_ino(inode), dl.object_no); |
214 | 214 | ||
215 | oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); | 215 | oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); |
216 | ceph_oid_set_name(&oid, dl.object_name); | 216 | ceph_oid_printf(&oid, "%s", dl.object_name); |
217 | 217 | ||
218 | r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); | 218 | r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); |
219 | if (r < 0) { | 219 | if (r < 0) { |
220 | up_read(&osdc->map_sem); | 220 | up_read(&osdc->lock); |
221 | return r; | 221 | return r; |
222 | } | 222 | } |
223 | 223 | ||
224 | dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); | 224 | dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); |
225 | if (dl.osd >= 0) { | 225 | if (dl.osd >= 0) { |
226 | struct ceph_entity_addr *a = | 226 | struct ceph_entity_addr *a = |
227 | ceph_osd_addr(osdc->osdmap, dl.osd); | 227 | ceph_osd_addr(osdc->osdmap, dl.osd); |
@@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) | |||
230 | } else { | 230 | } else { |
231 | memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); | 231 | memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); |
232 | } | 232 | } |
233 | up_read(&osdc->map_sem); | 233 | up_read(&osdc->lock); |
234 | 234 | ||
235 | /* send result back to user */ | 235 | /* send result back to user */ |
236 | if (copy_to_user(arg, &dl, sizeof(dl))) | 236 | if (copy_to_user(arg, &dl, sizeof(dl))) |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a0..2103b823bec0 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end, | |||
181 | 181 | ||
182 | ceph_decode_need(p, end, sizeof(num) + 2, bad); | 182 | ceph_decode_need(p, end, sizeof(num) + 2, bad); |
183 | num = ceph_decode_32(p); | 183 | num = ceph_decode_32(p); |
184 | info->dir_end = ceph_decode_8(p); | 184 | { |
185 | info->dir_complete = ceph_decode_8(p); | 185 | u16 flags = ceph_decode_16(p); |
186 | info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); | ||
187 | info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); | ||
188 | info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); | ||
189 | } | ||
186 | if (num == 0) | 190 | if (num == 0) |
187 | goto done; | 191 | goto done; |
188 | 192 | ||
189 | BUG_ON(!info->dir_in); | 193 | BUG_ON(!info->dir_entries); |
190 | info->dir_dname = (void *)(info->dir_in + num); | 194 | if ((unsigned long)(info->dir_entries + num) > |
191 | info->dir_dname_len = (void *)(info->dir_dname + num); | 195 | (unsigned long)info->dir_entries + info->dir_buf_size) { |
192 | info->dir_dlease = (void *)(info->dir_dname_len + num); | ||
193 | if ((unsigned long)(info->dir_dlease + num) > | ||
194 | (unsigned long)info->dir_in + info->dir_buf_size) { | ||
195 | pr_err("dir contents are larger than expected\n"); | 196 | pr_err("dir contents are larger than expected\n"); |
196 | WARN_ON(1); | 197 | WARN_ON(1); |
197 | goto bad; | 198 | goto bad; |
@@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end, | |||
199 | 200 | ||
200 | info->dir_nr = num; | 201 | info->dir_nr = num; |
201 | while (num) { | 202 | while (num) { |
203 | struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; | ||
202 | /* dentry */ | 204 | /* dentry */ |
203 | ceph_decode_need(p, end, sizeof(u32)*2, bad); | 205 | ceph_decode_need(p, end, sizeof(u32)*2, bad); |
204 | info->dir_dname_len[i] = ceph_decode_32(p); | 206 | rde->name_len = ceph_decode_32(p); |
205 | ceph_decode_need(p, end, info->dir_dname_len[i], bad); | 207 | ceph_decode_need(p, end, rde->name_len, bad); |
206 | info->dir_dname[i] = *p; | 208 | rde->name = *p; |
207 | *p += info->dir_dname_len[i]; | 209 | *p += rde->name_len; |
208 | dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], | 210 | dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); |
209 | info->dir_dname[i]); | 211 | rde->lease = *p; |
210 | info->dir_dlease[i] = *p; | ||
211 | *p += sizeof(struct ceph_mds_reply_lease); | 212 | *p += sizeof(struct ceph_mds_reply_lease); |
212 | 213 | ||
213 | /* inode */ | 214 | /* inode */ |
214 | err = parse_reply_info_in(p, end, &info->dir_in[i], features); | 215 | err = parse_reply_info_in(p, end, &rde->inode, features); |
215 | if (err < 0) | 216 | if (err < 0) |
216 | goto out_bad; | 217 | goto out_bad; |
218 | /* ceph_readdir_prepopulate() will update it */ | ||
219 | rde->offset = 0; | ||
217 | i++; | 220 | i++; |
218 | num--; | 221 | num--; |
219 | } | 222 | } |
@@ -345,9 +348,9 @@ out_bad: | |||
345 | 348 | ||
346 | static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) | 349 | static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) |
347 | { | 350 | { |
348 | if (!info->dir_in) | 351 | if (!info->dir_entries) |
349 | return; | 352 | return; |
350 | free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); | 353 | free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); |
351 | } | 354 | } |
352 | 355 | ||
353 | 356 | ||
@@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref) | |||
567 | kfree(req); | 570 | kfree(req); |
568 | } | 571 | } |
569 | 572 | ||
573 | DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) | ||
574 | |||
570 | /* | 575 | /* |
571 | * lookup session, bump ref if found. | 576 | * lookup session, bump ref if found. |
572 | * | 577 | * |
573 | * called under mdsc->mutex. | 578 | * called under mdsc->mutex. |
574 | */ | 579 | */ |
575 | static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, | 580 | static struct ceph_mds_request * |
576 | u64 tid) | 581 | lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) |
577 | { | 582 | { |
578 | struct ceph_mds_request *req; | 583 | struct ceph_mds_request *req; |
579 | struct rb_node *n = mdsc->request_tree.rb_node; | ||
580 | |||
581 | while (n) { | ||
582 | req = rb_entry(n, struct ceph_mds_request, r_node); | ||
583 | if (tid < req->r_tid) | ||
584 | n = n->rb_left; | ||
585 | else if (tid > req->r_tid) | ||
586 | n = n->rb_right; | ||
587 | else { | ||
588 | ceph_mdsc_get_request(req); | ||
589 | return req; | ||
590 | } | ||
591 | } | ||
592 | return NULL; | ||
593 | } | ||
594 | 584 | ||
595 | static void __insert_request(struct ceph_mds_client *mdsc, | 585 | req = lookup_request(&mdsc->request_tree, tid); |
596 | struct ceph_mds_request *new) | 586 | if (req) |
597 | { | 587 | ceph_mdsc_get_request(req); |
598 | struct rb_node **p = &mdsc->request_tree.rb_node; | ||
599 | struct rb_node *parent = NULL; | ||
600 | struct ceph_mds_request *req = NULL; | ||
601 | 588 | ||
602 | while (*p) { | 589 | return req; |
603 | parent = *p; | ||
604 | req = rb_entry(parent, struct ceph_mds_request, r_node); | ||
605 | if (new->r_tid < req->r_tid) | ||
606 | p = &(*p)->rb_left; | ||
607 | else if (new->r_tid > req->r_tid) | ||
608 | p = &(*p)->rb_right; | ||
609 | else | ||
610 | BUG(); | ||
611 | } | ||
612 | |||
613 | rb_link_node(&new->r_node, parent, p); | ||
614 | rb_insert_color(&new->r_node, &mdsc->request_tree); | ||
615 | } | 590 | } |
616 | 591 | ||
617 | /* | 592 | /* |
@@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc, | |||
630 | req->r_num_caps); | 605 | req->r_num_caps); |
631 | dout("__register_request %p tid %lld\n", req, req->r_tid); | 606 | dout("__register_request %p tid %lld\n", req, req->r_tid); |
632 | ceph_mdsc_get_request(req); | 607 | ceph_mdsc_get_request(req); |
633 | __insert_request(mdsc, req); | 608 | insert_request(&mdsc->request_tree, req); |
634 | 609 | ||
635 | req->r_uid = current_fsuid(); | 610 | req->r_uid = current_fsuid(); |
636 | req->r_gid = current_fsgid(); | 611 | req->r_gid = current_fsgid(); |
@@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
663 | } | 638 | } |
664 | } | 639 | } |
665 | 640 | ||
666 | rb_erase(&req->r_node, &mdsc->request_tree); | 641 | erase_request(&mdsc->request_tree, req); |
667 | RB_CLEAR_NODE(&req->r_node); | ||
668 | 642 | ||
669 | if (req->r_unsafe_dir && req->r_got_unsafe) { | 643 | if (req->r_unsafe_dir && req->r_got_unsafe) { |
670 | struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); | 644 | struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); |
@@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 | |||
868 | int metadata_bytes = 0; | 842 | int metadata_bytes = 0; |
869 | int metadata_key_count = 0; | 843 | int metadata_key_count = 0; |
870 | struct ceph_options *opt = mdsc->fsc->client->options; | 844 | struct ceph_options *opt = mdsc->fsc->client->options; |
845 | struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; | ||
871 | void *p; | 846 | void *p; |
872 | 847 | ||
873 | const char* metadata[][2] = { | 848 | const char* metadata[][2] = { |
874 | {"hostname", utsname()->nodename}, | 849 | {"hostname", utsname()->nodename}, |
875 | {"kernel_version", utsname()->release}, | 850 | {"kernel_version", utsname()->release}, |
876 | {"entity_id", opt->name ? opt->name : ""}, | 851 | {"entity_id", opt->name ? : ""}, |
852 | {"root", fsopt->server_path ? : "/"}, | ||
877 | {NULL, NULL} | 853 | {NULL, NULL} |
878 | }; | 854 | }; |
879 | 855 | ||
@@ -1149,9 +1125,11 @@ out: | |||
1149 | static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | 1125 | static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, |
1150 | void *arg) | 1126 | void *arg) |
1151 | { | 1127 | { |
1128 | struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; | ||
1152 | struct ceph_inode_info *ci = ceph_inode(inode); | 1129 | struct ceph_inode_info *ci = ceph_inode(inode); |
1153 | LIST_HEAD(to_remove); | 1130 | LIST_HEAD(to_remove); |
1154 | int drop = 0; | 1131 | bool drop = false; |
1132 | bool invalidate = false; | ||
1155 | 1133 | ||
1156 | dout("removing cap %p, ci is %p, inode is %p\n", | 1134 | dout("removing cap %p, ci is %p, inode is %p\n", |
1157 | cap, ci, &ci->vfs_inode); | 1135 | cap, ci, &ci->vfs_inode); |
@@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1159 | __ceph_remove_cap(cap, false); | 1137 | __ceph_remove_cap(cap, false); |
1160 | if (!ci->i_auth_cap) { | 1138 | if (!ci->i_auth_cap) { |
1161 | struct ceph_cap_flush *cf; | 1139 | struct ceph_cap_flush *cf; |
1162 | struct ceph_mds_client *mdsc = | 1140 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1163 | ceph_sb_to_client(inode->i_sb)->mdsc; | 1141 | |
1142 | ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; | ||
1143 | |||
1144 | if (ci->i_wrbuffer_ref > 0 && | ||
1145 | ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) | ||
1146 | invalidate = true; | ||
1164 | 1147 | ||
1165 | while (true) { | 1148 | while (true) { |
1166 | struct rb_node *n = rb_first(&ci->i_cap_flush_tree); | 1149 | struct rb_node *n = rb_first(&ci->i_cap_flush_tree); |
@@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1183 | inode, ceph_ino(inode)); | 1166 | inode, ceph_ino(inode)); |
1184 | ci->i_dirty_caps = 0; | 1167 | ci->i_dirty_caps = 0; |
1185 | list_del_init(&ci->i_dirty_item); | 1168 | list_del_init(&ci->i_dirty_item); |
1186 | drop = 1; | 1169 | drop = true; |
1187 | } | 1170 | } |
1188 | if (!list_empty(&ci->i_flushing_item)) { | 1171 | if (!list_empty(&ci->i_flushing_item)) { |
1189 | pr_warn_ratelimited( | 1172 | pr_warn_ratelimited( |
@@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1193 | ci->i_flushing_caps = 0; | 1176 | ci->i_flushing_caps = 0; |
1194 | list_del_init(&ci->i_flushing_item); | 1177 | list_del_init(&ci->i_flushing_item); |
1195 | mdsc->num_cap_flushing--; | 1178 | mdsc->num_cap_flushing--; |
1196 | drop = 1; | 1179 | drop = true; |
1197 | } | 1180 | } |
1198 | spin_unlock(&mdsc->cap_dirty_lock); | 1181 | spin_unlock(&mdsc->cap_dirty_lock); |
1199 | 1182 | ||
@@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1210 | list_del(&cf->list); | 1193 | list_del(&cf->list); |
1211 | ceph_free_cap_flush(cf); | 1194 | ceph_free_cap_flush(cf); |
1212 | } | 1195 | } |
1213 | while (drop--) | 1196 | |
1197 | wake_up_all(&ci->i_cap_wq); | ||
1198 | if (invalidate) | ||
1199 | ceph_queue_invalidate(inode); | ||
1200 | if (drop) | ||
1214 | iput(inode); | 1201 | iput(inode); |
1215 | return 0; | 1202 | return 0; |
1216 | } | 1203 | } |
@@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1220 | */ | 1207 | */ |
1221 | static void remove_session_caps(struct ceph_mds_session *session) | 1208 | static void remove_session_caps(struct ceph_mds_session *session) |
1222 | { | 1209 | { |
1210 | struct ceph_fs_client *fsc = session->s_mdsc->fsc; | ||
1211 | struct super_block *sb = fsc->sb; | ||
1223 | dout("remove_session_caps on %p\n", session); | 1212 | dout("remove_session_caps on %p\n", session); |
1224 | iterate_session_caps(session, remove_session_caps_cb, NULL); | 1213 | iterate_session_caps(session, remove_session_caps_cb, fsc); |
1225 | 1214 | ||
1226 | spin_lock(&session->s_cap_lock); | 1215 | spin_lock(&session->s_cap_lock); |
1227 | if (session->s_nr_caps > 0) { | 1216 | if (session->s_nr_caps > 0) { |
1228 | struct super_block *sb = session->s_mdsc->fsc->sb; | ||
1229 | struct inode *inode; | 1217 | struct inode *inode; |
1230 | struct ceph_cap *cap, *prev = NULL; | 1218 | struct ceph_cap *cap, *prev = NULL; |
1231 | struct ceph_vino vino; | 1219 | struct ceph_vino vino; |
@@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, | |||
1270 | { | 1258 | { |
1271 | struct ceph_inode_info *ci = ceph_inode(inode); | 1259 | struct ceph_inode_info *ci = ceph_inode(inode); |
1272 | 1260 | ||
1273 | wake_up_all(&ci->i_cap_wq); | ||
1274 | if (arg) { | 1261 | if (arg) { |
1275 | spin_lock(&ci->i_ceph_lock); | 1262 | spin_lock(&ci->i_ceph_lock); |
1276 | ci->i_wanted_max_size = 0; | 1263 | ci->i_wanted_max_size = 0; |
1277 | ci->i_requested_max_size = 0; | 1264 | ci->i_requested_max_size = 0; |
1278 | spin_unlock(&ci->i_ceph_lock); | 1265 | spin_unlock(&ci->i_ceph_lock); |
1279 | } | 1266 | } |
1267 | wake_up_all(&ci->i_cap_wq); | ||
1280 | return 0; | 1268 | return 0; |
1281 | } | 1269 | } |
1282 | 1270 | ||
@@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, | |||
1671 | struct ceph_inode_info *ci = ceph_inode(dir); | 1659 | struct ceph_inode_info *ci = ceph_inode(dir); |
1672 | struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; | 1660 | struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; |
1673 | struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; | 1661 | struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; |
1674 | size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + | 1662 | size_t size = sizeof(struct ceph_mds_reply_dir_entry); |
1675 | sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); | ||
1676 | int order, num_entries; | 1663 | int order, num_entries; |
1677 | 1664 | ||
1678 | spin_lock(&ci->i_ceph_lock); | 1665 | spin_lock(&ci->i_ceph_lock); |
@@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, | |||
1683 | 1670 | ||
1684 | order = get_order(size * num_entries); | 1671 | order = get_order(size * num_entries); |
1685 | while (order >= 0) { | 1672 | while (order >= 0) { |
1686 | rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | | 1673 | rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | |
1687 | __GFP_NOWARN, | 1674 | __GFP_NOWARN, |
1688 | order); | 1675 | order); |
1689 | if (rinfo->dir_in) | 1676 | if (rinfo->dir_entries) |
1690 | break; | 1677 | break; |
1691 | order--; | 1678 | order--; |
1692 | } | 1679 | } |
1693 | if (!rinfo->dir_in) | 1680 | if (!rinfo->dir_entries) |
1694 | return -ENOMEM; | 1681 | return -ENOMEM; |
1695 | 1682 | ||
1696 | num_entries = (PAGE_SIZE << order) / size; | 1683 | num_entries = (PAGE_SIZE << order) / size; |
@@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) | |||
1722 | INIT_LIST_HEAD(&req->r_unsafe_target_item); | 1709 | INIT_LIST_HEAD(&req->r_unsafe_target_item); |
1723 | req->r_fmode = -1; | 1710 | req->r_fmode = -1; |
1724 | kref_init(&req->r_kref); | 1711 | kref_init(&req->r_kref); |
1712 | RB_CLEAR_NODE(&req->r_node); | ||
1725 | INIT_LIST_HEAD(&req->r_wait); | 1713 | INIT_LIST_HEAD(&req->r_wait); |
1726 | init_completion(&req->r_completion); | 1714 | init_completion(&req->r_completion); |
1727 | init_completion(&req->r_safe_completion); | 1715 | init_completion(&req->r_safe_completion); |
@@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
2414 | /* get request, session */ | 2402 | /* get request, session */ |
2415 | tid = le64_to_cpu(msg->hdr.tid); | 2403 | tid = le64_to_cpu(msg->hdr.tid); |
2416 | mutex_lock(&mdsc->mutex); | 2404 | mutex_lock(&mdsc->mutex); |
2417 | req = __lookup_request(mdsc, tid); | 2405 | req = lookup_get_request(mdsc, tid); |
2418 | if (!req) { | 2406 | if (!req) { |
2419 | dout("handle_reply on unknown tid %llu\n", tid); | 2407 | dout("handle_reply on unknown tid %llu\n", tid); |
2420 | mutex_unlock(&mdsc->mutex); | 2408 | mutex_unlock(&mdsc->mutex); |
@@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, | |||
2604 | fwd_seq = ceph_decode_32(&p); | 2592 | fwd_seq = ceph_decode_32(&p); |
2605 | 2593 | ||
2606 | mutex_lock(&mdsc->mutex); | 2594 | mutex_lock(&mdsc->mutex); |
2607 | req = __lookup_request(mdsc, tid); | 2595 | req = lookup_get_request(mdsc, tid); |
2608 | if (!req) { | 2596 | if (!req) { |
2609 | dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); | 2597 | dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); |
2610 | goto out; /* dup reply? */ | 2598 | goto out; /* dup reply? */ |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee69a537dba5..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
@@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in { | |||
47 | u32 pool_ns_len; | 47 | u32 pool_ns_len; |
48 | }; | 48 | }; |
49 | 49 | ||
50 | struct ceph_mds_reply_dir_entry { | ||
51 | char *name; | ||
52 | u32 name_len; | ||
53 | struct ceph_mds_reply_lease *lease; | ||
54 | struct ceph_mds_reply_info_in inode; | ||
55 | loff_t offset; | ||
56 | }; | ||
57 | |||
50 | /* | 58 | /* |
51 | * parsed info about an mds reply, including information about | 59 | * parsed info about an mds reply, including information about |
52 | * either: 1) the target inode and/or its parent directory and dentry, | 60 | * either: 1) the target inode and/or its parent directory and dentry, |
@@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed { | |||
73 | struct ceph_mds_reply_dirfrag *dir_dir; | 81 | struct ceph_mds_reply_dirfrag *dir_dir; |
74 | size_t dir_buf_size; | 82 | size_t dir_buf_size; |
75 | int dir_nr; | 83 | int dir_nr; |
76 | char **dir_dname; | 84 | bool dir_complete; |
77 | u32 *dir_dname_len; | 85 | bool dir_end; |
78 | struct ceph_mds_reply_lease **dir_dlease; | 86 | bool hash_order; |
79 | struct ceph_mds_reply_info_in *dir_in; | 87 | struct ceph_mds_reply_dir_entry *dir_entries; |
80 | u8 dir_complete, dir_end; | ||
81 | }; | 88 | }; |
82 | 89 | ||
83 | /* for create results */ | 90 | /* for create results */ |
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d..8c3591a7fbae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c | |||
@@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
54 | const void *start = *p; | 54 | const void *start = *p; |
55 | int i, j, n; | 55 | int i, j, n; |
56 | int err = -EINVAL; | 56 | int err = -EINVAL; |
57 | u16 version; | 57 | u8 mdsmap_v, mdsmap_cv; |
58 | 58 | ||
59 | m = kzalloc(sizeof(*m), GFP_NOFS); | 59 | m = kzalloc(sizeof(*m), GFP_NOFS); |
60 | if (m == NULL) | 60 | if (m == NULL) |
61 | return ERR_PTR(-ENOMEM); | 61 | return ERR_PTR(-ENOMEM); |
62 | 62 | ||
63 | ceph_decode_16_safe(p, end, version, bad); | 63 | ceph_decode_need(p, end, 1 + 1, bad); |
64 | if (version > 3) { | 64 | mdsmap_v = ceph_decode_8(p); |
65 | pr_warn("got mdsmap version %d > 3, failing", version); | 65 | mdsmap_cv = ceph_decode_8(p); |
66 | goto bad; | 66 | if (mdsmap_v >= 4) { |
67 | u32 mdsmap_len; | ||
68 | ceph_decode_32_safe(p, end, mdsmap_len, bad); | ||
69 | if (end < *p + mdsmap_len) | ||
70 | goto bad; | ||
71 | end = *p + mdsmap_len; | ||
67 | } | 72 | } |
68 | 73 | ||
69 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); | 74 | ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); |
@@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
87 | u32 namelen; | 92 | u32 namelen; |
88 | s32 mds, inc, state; | 93 | s32 mds, inc, state; |
89 | u64 state_seq; | 94 | u64 state_seq; |
90 | u8 infoversion; | 95 | u8 info_v; |
96 | void *info_end = NULL; | ||
91 | struct ceph_entity_addr addr; | 97 | struct ceph_entity_addr addr; |
92 | u32 num_export_targets; | 98 | u32 num_export_targets; |
93 | void *pexport_targets = NULL; | 99 | void *pexport_targets = NULL; |
94 | struct ceph_timespec laggy_since; | 100 | struct ceph_timespec laggy_since; |
95 | struct ceph_mds_info *info; | 101 | struct ceph_mds_info *info; |
96 | 102 | ||
97 | ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); | 103 | ceph_decode_need(p, end, sizeof(u64) + 1, bad); |
98 | global_id = ceph_decode_64(p); | 104 | global_id = ceph_decode_64(p); |
99 | infoversion = ceph_decode_8(p); | 105 | info_v= ceph_decode_8(p); |
106 | if (info_v >= 4) { | ||
107 | u32 info_len; | ||
108 | u8 info_cv; | ||
109 | ceph_decode_need(p, end, 1 + sizeof(u32), bad); | ||
110 | info_cv = ceph_decode_8(p); | ||
111 | info_len = ceph_decode_32(p); | ||
112 | info_end = *p + info_len; | ||
113 | if (info_end > end) | ||
114 | goto bad; | ||
115 | } | ||
116 | |||
117 | ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); | ||
100 | *p += sizeof(u64); | 118 | *p += sizeof(u64); |
101 | namelen = ceph_decode_32(p); /* skip mds name */ | 119 | namelen = ceph_decode_32(p); /* skip mds name */ |
102 | *p += namelen; | 120 | *p += namelen; |
@@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
115 | *p += sizeof(u32); | 133 | *p += sizeof(u32); |
116 | ceph_decode_32_safe(p, end, namelen, bad); | 134 | ceph_decode_32_safe(p, end, namelen, bad); |
117 | *p += namelen; | 135 | *p += namelen; |
118 | if (infoversion >= 2) { | 136 | if (info_v >= 2) { |
119 | ceph_decode_32_safe(p, end, num_export_targets, bad); | 137 | ceph_decode_32_safe(p, end, num_export_targets, bad); |
120 | pexport_targets = *p; | 138 | pexport_targets = *p; |
121 | *p += num_export_targets * sizeof(u32); | 139 | *p += num_export_targets * sizeof(u32); |
@@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
123 | num_export_targets = 0; | 141 | num_export_targets = 0; |
124 | } | 142 | } |
125 | 143 | ||
144 | if (info_end && *p != info_end) { | ||
145 | if (*p > info_end) | ||
146 | goto bad; | ||
147 | *p = info_end; | ||
148 | } | ||
149 | |||
126 | dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", | 150 | dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", |
127 | i+1, n, global_id, mds, inc, | 151 | i+1, n, global_id, mds, inc, |
128 | ceph_pr_addr(&addr.in_addr), | 152 | ceph_pr_addr(&addr.in_addr), |
@@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) | |||
163 | m->m_cas_pg_pool = ceph_decode_64(p); | 187 | m->m_cas_pg_pool = ceph_decode_64(p); |
164 | 188 | ||
165 | /* ok, we don't care about the rest. */ | 189 | /* ok, we don't care about the rest. */ |
190 | *p = end; | ||
166 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); | 191 | dout("mdsmap_decode success epoch %u\n", m->m_epoch); |
167 | return m; | 192 | return m; |
168 | 193 | ||
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f12d5e2955c2..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) | |||
108 | * mount options | 108 | * mount options |
109 | */ | 109 | */ |
110 | enum { | 110 | enum { |
111 | Opt_mds_namespace, | ||
111 | Opt_wsize, | 112 | Opt_wsize, |
112 | Opt_rsize, | 113 | Opt_rsize, |
113 | Opt_rasize, | 114 | Opt_rasize, |
@@ -143,6 +144,7 @@ enum { | |||
143 | }; | 144 | }; |
144 | 145 | ||
145 | static match_table_t fsopt_tokens = { | 146 | static match_table_t fsopt_tokens = { |
147 | {Opt_mds_namespace, "mds_namespace=%d"}, | ||
146 | {Opt_wsize, "wsize=%d"}, | 148 | {Opt_wsize, "wsize=%d"}, |
147 | {Opt_rsize, "rsize=%d"}, | 149 | {Opt_rsize, "rsize=%d"}, |
148 | {Opt_rasize, "rasize=%d"}, | 150 | {Opt_rasize, "rasize=%d"}, |
@@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) | |||
212 | break; | 214 | break; |
213 | 215 | ||
214 | /* misc */ | 216 | /* misc */ |
217 | case Opt_mds_namespace: | ||
218 | fsopt->mds_namespace = intval; | ||
219 | break; | ||
215 | case Opt_wsize: | 220 | case Opt_wsize: |
216 | fsopt->wsize = intval; | 221 | fsopt->wsize = intval; |
217 | break; | 222 | break; |
@@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) | |||
297 | { | 302 | { |
298 | dout("destroy_mount_options %p\n", args); | 303 | dout("destroy_mount_options %p\n", args); |
299 | kfree(args->snapdir_name); | 304 | kfree(args->snapdir_name); |
305 | kfree(args->server_path); | ||
300 | kfree(args); | 306 | kfree(args); |
301 | } | 307 | } |
302 | 308 | ||
@@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, | |||
328 | if (ret) | 334 | if (ret) |
329 | return ret; | 335 | return ret; |
330 | 336 | ||
337 | ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); | ||
338 | if (ret) | ||
339 | return ret; | ||
340 | |||
331 | return ceph_compare_options(new_opt, fsc->client); | 341 | return ceph_compare_options(new_opt, fsc->client); |
332 | } | 342 | } |
333 | 343 | ||
334 | static int parse_mount_options(struct ceph_mount_options **pfsopt, | 344 | static int parse_mount_options(struct ceph_mount_options **pfsopt, |
335 | struct ceph_options **popt, | 345 | struct ceph_options **popt, |
336 | int flags, char *options, | 346 | int flags, char *options, |
337 | const char *dev_name, | 347 | const char *dev_name) |
338 | const char **path) | ||
339 | { | 348 | { |
340 | struct ceph_mount_options *fsopt; | 349 | struct ceph_mount_options *fsopt; |
341 | const char *dev_name_end; | 350 | const char *dev_name_end; |
@@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, | |||
367 | fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; | 376 | fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; |
368 | fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; | 377 | fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; |
369 | fsopt->congestion_kb = default_congestion_kb(); | 378 | fsopt->congestion_kb = default_congestion_kb(); |
379 | fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; | ||
370 | 380 | ||
371 | /* | 381 | /* |
372 | * Distinguish the server list from the path in "dev_name". | 382 | * Distinguish the server list from the path in "dev_name". |
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, | |||
380 | */ | 390 | */ |
381 | dev_name_end = strchr(dev_name, '/'); | 391 | dev_name_end = strchr(dev_name, '/'); |
382 | if (dev_name_end) { | 392 | if (dev_name_end) { |
383 | /* skip over leading '/' for path */ | 393 | fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); |
384 | *path = dev_name_end + 1; | 394 | if (!fsopt->server_path) { |
395 | err = -ENOMEM; | ||
396 | goto out; | ||
397 | } | ||
385 | } else { | 398 | } else { |
386 | /* path is empty */ | ||
387 | dev_name_end = dev_name + strlen(dev_name); | 399 | dev_name_end = dev_name + strlen(dev_name); |
388 | *path = dev_name_end; | ||
389 | } | 400 | } |
390 | err = -EINVAL; | 401 | err = -EINVAL; |
391 | dev_name_end--; /* back up to ':' separator */ | 402 | dev_name_end--; /* back up to ':' separator */ |
@@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, | |||
395 | goto out; | 406 | goto out; |
396 | } | 407 | } |
397 | dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); | 408 | dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); |
398 | dout("server path '%s'\n", *path); | 409 | if (fsopt->server_path) |
410 | dout("server path '%s'\n", fsopt->server_path); | ||
399 | 411 | ||
400 | *popt = ceph_parse_options(options, dev_name, dev_name_end, | 412 | *popt = ceph_parse_options(options, dev_name, dev_name_end, |
401 | parse_fsopt_token, (void *)fsopt); | 413 | parse_fsopt_token, (void *)fsopt); |
@@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) | |||
457 | seq_puts(m, ",noacl"); | 469 | seq_puts(m, ",noacl"); |
458 | #endif | 470 | #endif |
459 | 471 | ||
472 | if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) | ||
473 | seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); | ||
460 | if (fsopt->wsize) | 474 | if (fsopt->wsize) |
461 | seq_printf(m, ",wsize=%d", fsopt->wsize); | 475 | seq_printf(m, ",wsize=%d", fsopt->wsize); |
462 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) | 476 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) |
@@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, | |||
511 | { | 525 | { |
512 | struct ceph_fs_client *fsc; | 526 | struct ceph_fs_client *fsc; |
513 | const u64 supported_features = | 527 | const u64 supported_features = |
514 | CEPH_FEATURE_FLOCK | | 528 | CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | |
515 | CEPH_FEATURE_DIRLAYOUTHASH | | 529 | CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; |
516 | CEPH_FEATURE_MDS_INLINE_DATA; | ||
517 | const u64 required_features = 0; | 530 | const u64 required_features = 0; |
518 | int page_count; | 531 | int page_count; |
519 | size_t size; | 532 | size_t size; |
@@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, | |||
530 | goto fail; | 543 | goto fail; |
531 | } | 544 | } |
532 | fsc->client->extra_mon_dispatch = extra_mon_dispatch; | 545 | fsc->client->extra_mon_dispatch = extra_mon_dispatch; |
546 | fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; | ||
533 | ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); | 547 | ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); |
534 | 548 | ||
535 | fsc->mount_options = fsopt; | 549 | fsc->mount_options = fsopt; |
@@ -785,8 +799,7 @@ out: | |||
785 | /* | 799 | /* |
786 | * mount: join the ceph cluster, and open root directory. | 800 | * mount: join the ceph cluster, and open root directory. |
787 | */ | 801 | */ |
788 | static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, | 802 | static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) |
789 | const char *path) | ||
790 | { | 803 | { |
791 | int err; | 804 | int err; |
792 | unsigned long started = jiffies; /* note the start time */ | 805 | unsigned long started = jiffies; /* note the start time */ |
@@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, | |||
815 | goto fail; | 828 | goto fail; |
816 | } | 829 | } |
817 | 830 | ||
818 | if (path[0] == 0) { | 831 | if (!fsc->mount_options->server_path) { |
819 | root = fsc->sb->s_root; | 832 | root = fsc->sb->s_root; |
820 | dget(root); | 833 | dget(root); |
821 | } else { | 834 | } else { |
822 | dout("mount opening base mountpoint\n"); | 835 | const char *path = fsc->mount_options->server_path + 1; |
836 | dout("mount opening path %s\n", path); | ||
823 | root = open_root_dentry(fsc, path, started); | 837 | root = open_root_dentry(fsc, path, started); |
824 | if (IS_ERR(root)) { | 838 | if (IS_ERR(root)) { |
825 | err = PTR_ERR(root); | 839 | err = PTR_ERR(root); |
@@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, | |||
935 | struct dentry *res; | 949 | struct dentry *res; |
936 | int err; | 950 | int err; |
937 | int (*compare_super)(struct super_block *, void *) = ceph_compare_super; | 951 | int (*compare_super)(struct super_block *, void *) = ceph_compare_super; |
938 | const char *path = NULL; | ||
939 | struct ceph_mount_options *fsopt = NULL; | 952 | struct ceph_mount_options *fsopt = NULL; |
940 | struct ceph_options *opt = NULL; | 953 | struct ceph_options *opt = NULL; |
941 | 954 | ||
@@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, | |||
944 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | 957 | #ifdef CONFIG_CEPH_FS_POSIX_ACL |
945 | flags |= MS_POSIXACL; | 958 | flags |= MS_POSIXACL; |
946 | #endif | 959 | #endif |
947 | err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); | 960 | err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); |
948 | if (err < 0) { | 961 | if (err < 0) { |
949 | res = ERR_PTR(err); | 962 | res = ERR_PTR(err); |
950 | goto out_final; | 963 | goto out_final; |
@@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, | |||
987 | } | 1000 | } |
988 | } | 1001 | } |
989 | 1002 | ||
990 | res = ceph_real_mount(fsc, path); | 1003 | res = ceph_real_mount(fsc); |
991 | if (IS_ERR(res)) | 1004 | if (IS_ERR(res)) |
992 | goto out_splat; | 1005 | goto out_splat; |
993 | dout("root %p inode %p ino %llx.%llx\n", res, | 1006 | dout("root %p inode %p ino %llx.%llx\n", res, |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7b99eb756477..0130a8592191 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -62,6 +62,7 @@ struct ceph_mount_options { | |||
62 | int cap_release_safety; | 62 | int cap_release_safety; |
63 | int max_readdir; /* max readdir result (entires) */ | 63 | int max_readdir; /* max readdir result (entires) */ |
64 | int max_readdir_bytes; /* max readdir result (bytes) */ | 64 | int max_readdir_bytes; /* max readdir result (bytes) */ |
65 | int mds_namespace; | ||
65 | 66 | ||
66 | /* | 67 | /* |
67 | * everything above this point can be memcmp'd; everything below | 68 | * everything above this point can be memcmp'd; everything below |
@@ -69,6 +70,7 @@ struct ceph_mount_options { | |||
69 | */ | 70 | */ |
70 | 71 | ||
71 | char *snapdir_name; /* default ".snap" */ | 72 | char *snapdir_name; /* default ".snap" */ |
73 | char *server_path; /* default "/" */ | ||
72 | }; | 74 | }; |
73 | 75 | ||
74 | struct ceph_fs_client { | 76 | struct ceph_fs_client { |
@@ -295,6 +297,7 @@ struct ceph_inode_info { | |||
295 | u64 i_files, i_subdirs; | 297 | u64 i_files, i_subdirs; |
296 | 298 | ||
297 | struct rb_root i_fragtree; | 299 | struct rb_root i_fragtree; |
300 | int i_fragtree_nsplits; | ||
298 | struct mutex i_fragtree_mutex; | 301 | struct mutex i_fragtree_mutex; |
299 | 302 | ||
300 | struct ceph_inode_xattrs_info i_xattrs; | 303 | struct ceph_inode_xattrs_info i_xattrs; |
@@ -469,6 +472,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, | |||
469 | #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ | 472 | #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ |
470 | #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ | 473 | #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ |
471 | #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ | 474 | #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ |
475 | #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ | ||
472 | 476 | ||
473 | static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, | 477 | static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, |
474 | long long release_count, | 478 | long long release_count, |
@@ -537,11 +541,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) | |||
537 | return (struct ceph_dentry_info *)dentry->d_fsdata; | 541 | return (struct ceph_dentry_info *)dentry->d_fsdata; |
538 | } | 542 | } |
539 | 543 | ||
540 | static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) | ||
541 | { | ||
542 | return ((loff_t)frag << 32) | (loff_t)off; | ||
543 | } | ||
544 | |||
545 | /* | 544 | /* |
546 | * caps helpers | 545 | * caps helpers |
547 | */ | 546 | */ |
@@ -632,7 +631,6 @@ struct ceph_file_info { | |||
632 | struct ceph_mds_request *last_readdir; | 631 | struct ceph_mds_request *last_readdir; |
633 | 632 | ||
634 | /* readdir: position within a frag */ | 633 | /* readdir: position within a frag */ |
635 | unsigned offset; /* offset of last chunk, adjusted for . and .. */ | ||
636 | unsigned next_offset; /* offset of next chunk (last_name's + 1) */ | 634 | unsigned next_offset; /* offset of next chunk (last_name's + 1) */ |
637 | char *last_name; /* last entry in previous chunk */ | 635 | char *last_name; /* last entry in previous chunk */ |
638 | long long dir_release_count; | 636 | long long dir_release_count; |
@@ -927,6 +925,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); | |||
927 | /* file.c */ | 925 | /* file.c */ |
928 | extern const struct file_operations ceph_file_fops; | 926 | extern const struct file_operations ceph_file_fops; |
929 | 927 | ||
928 | extern int ceph_renew_caps(struct inode *inode); | ||
930 | extern int ceph_open(struct inode *inode, struct file *file); | 929 | extern int ceph_open(struct inode *inode, struct file *file); |
931 | extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | 930 | extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, |
932 | struct file *file, unsigned flags, umode_t mode, | 931 | struct file *file, unsigned flags, umode_t mode, |
@@ -942,6 +941,7 @@ extern const struct inode_operations ceph_snapdir_iops; | |||
942 | extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, | 941 | extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, |
943 | ceph_snapdir_dentry_ops; | 942 | ceph_snapdir_dentry_ops; |
944 | 943 | ||
944 | extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); | ||
945 | extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); | 945 | extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); |
946 | extern int ceph_handle_snapdir(struct ceph_mds_request *req, | 946 | extern int ceph_handle_snapdir(struct ceph_mds_request *req, |
947 | struct dentry *dentry, int err); | 947 | struct dentry *dentry, int err); |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0d66722c6a52..dacc1bd85629 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, | |||
77 | char buf[128]; | 77 | char buf[128]; |
78 | 78 | ||
79 | dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); | 79 | dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); |
80 | down_read(&osdc->map_sem); | 80 | down_read(&osdc->lock); |
81 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); | 81 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); |
82 | if (pool_name) { | 82 | if (pool_name) { |
83 | size_t len = strlen(pool_name); | 83 | size_t len = strlen(pool_name); |
@@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, | |||
109 | ret = -ERANGE; | 109 | ret = -ERANGE; |
110 | } | 110 | } |
111 | } | 111 | } |
112 | up_read(&osdc->map_sem); | 112 | up_read(&osdc->lock); |
113 | return ret; | 113 | return ret; |
114 | } | 114 | } |
115 | 115 | ||
@@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, | |||
143 | s64 pool = ceph_file_layout_pg_pool(ci->i_layout); | 143 | s64 pool = ceph_file_layout_pg_pool(ci->i_layout); |
144 | const char *pool_name; | 144 | const char *pool_name; |
145 | 145 | ||
146 | down_read(&osdc->map_sem); | 146 | down_read(&osdc->lock); |
147 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); | 147 | pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); |
148 | if (pool_name) | 148 | if (pool_name) |
149 | ret = snprintf(val, size, "%s", pool_name); | 149 | ret = snprintf(val, size, "%s", pool_name); |
150 | else | 150 | else |
151 | ret = snprintf(val, size, "%lld", (unsigned long long)pool); | 151 | ret = snprintf(val, size, "%lld", (unsigned long long)pool); |
152 | up_read(&osdc->map_sem); | 152 | up_read(&osdc->lock); |
153 | return ret; | 153 | return ret; |
154 | } | 154 | } |
155 | 155 | ||
@@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, | |||
862 | struct ceph_mds_request *req; | 862 | struct ceph_mds_request *req; |
863 | struct ceph_mds_client *mdsc = fsc->mdsc; | 863 | struct ceph_mds_client *mdsc = fsc->mdsc; |
864 | struct ceph_pagelist *pagelist = NULL; | 864 | struct ceph_pagelist *pagelist = NULL; |
865 | int op = CEPH_MDS_OP_SETXATTR; | ||
865 | int err; | 866 | int err; |
866 | 867 | ||
867 | if (size > 0) { | 868 | if (size > 0) { |
@@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, | |||
875 | if (err) | 876 | if (err) |
876 | goto out; | 877 | goto out; |
877 | } else if (!value) { | 878 | } else if (!value) { |
878 | flags |= CEPH_XATTR_REMOVE; | 879 | if (flags & CEPH_XATTR_REPLACE) |
880 | op = CEPH_MDS_OP_RMXATTR; | ||
881 | else | ||
882 | flags |= CEPH_XATTR_REMOVE; | ||
879 | } | 883 | } |
880 | 884 | ||
881 | dout("setxattr value=%.*s\n", (int)size, value); | 885 | dout("setxattr value=%.*s\n", (int)size, value); |
882 | 886 | ||
883 | /* do request */ | 887 | /* do request */ |
884 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, | 888 | req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); |
885 | USE_AUTH_MDS); | ||
886 | if (IS_ERR(req)) { | 889 | if (IS_ERR(req)) { |
887 | err = PTR_ERR(req); | 890 | err = PTR_ERR(req); |
888 | goto out; | 891 | goto out; |
889 | } | 892 | } |
890 | 893 | ||
891 | req->r_args.setxattr.flags = cpu_to_le32(flags); | ||
892 | req->r_path2 = kstrdup(name, GFP_NOFS); | 894 | req->r_path2 = kstrdup(name, GFP_NOFS); |
893 | if (!req->r_path2) { | 895 | if (!req->r_path2) { |
894 | ceph_mdsc_put_request(req); | 896 | ceph_mdsc_put_request(req); |
@@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, | |||
896 | goto out; | 898 | goto out; |
897 | } | 899 | } |
898 | 900 | ||
899 | req->r_pagelist = pagelist; | 901 | if (op == CEPH_MDS_OP_SETXATTR) { |
900 | pagelist = NULL; | 902 | req->r_args.setxattr.flags = cpu_to_le32(flags); |
903 | req->r_pagelist = pagelist; | ||
904 | pagelist = NULL; | ||
905 | } | ||
901 | 906 | ||
902 | req->r_inode = inode; | 907 | req->r_inode = inode; |
903 | ihold(inode); | 908 | ihold(inode); |
diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index b827e066e55a..146507df8650 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h | |||
@@ -51,11 +51,11 @@ static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) | |||
51 | return ceph_frag_make(newbits, | 51 | return ceph_frag_make(newbits, |
52 | ceph_frag_value(f) | (i << (24 - newbits))); | 52 | ceph_frag_value(f) | (i << (24 - newbits))); |
53 | } | 53 | } |
54 | static inline int ceph_frag_is_leftmost(__u32 f) | 54 | static inline bool ceph_frag_is_leftmost(__u32 f) |
55 | { | 55 | { |
56 | return ceph_frag_value(f) == 0; | 56 | return ceph_frag_value(f) == 0; |
57 | } | 57 | } |
58 | static inline int ceph_frag_is_rightmost(__u32 f) | 58 | static inline bool ceph_frag_is_rightmost(__u32 f) |
59 | { | 59 | { |
60 | return ceph_frag_value(f) == ceph_frag_mask(f); | 60 | return ceph_frag_value(f) == ceph_frag_mask(f); |
61 | } | 61 | } |
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 37f28bf55ce4..dfce616002ad 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h | |||
@@ -153,8 +153,9 @@ struct ceph_dir_layout { | |||
153 | 153 | ||
154 | /* watch-notify operations */ | 154 | /* watch-notify operations */ |
155 | enum { | 155 | enum { |
156 | WATCH_NOTIFY = 1, /* notifying watcher */ | 156 | CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ |
157 | WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ | 157 | CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ |
158 | CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ | ||
158 | }; | 159 | }; |
159 | 160 | ||
160 | 161 | ||
@@ -207,6 +208,8 @@ struct ceph_mon_subscribe_ack { | |||
207 | struct ceph_fsid fsid; | 208 | struct ceph_fsid fsid; |
208 | } __attribute__ ((packed)); | 209 | } __attribute__ ((packed)); |
209 | 210 | ||
211 | #define CEPH_FS_CLUSTER_ID_NONE -1 | ||
212 | |||
210 | /* | 213 | /* |
211 | * mdsmap flags | 214 | * mdsmap flags |
212 | */ | 215 | */ |
@@ -344,6 +347,18 @@ extern const char *ceph_mds_op_name(int op); | |||
344 | #define CEPH_XATTR_REPLACE (1 << 1) | 347 | #define CEPH_XATTR_REPLACE (1 << 1) |
345 | #define CEPH_XATTR_REMOVE (1 << 31) | 348 | #define CEPH_XATTR_REMOVE (1 << 31) |
346 | 349 | ||
350 | /* | ||
351 | * readdir request flags; | ||
352 | */ | ||
353 | #define CEPH_READDIR_REPLY_BITFLAGS (1<<0) | ||
354 | |||
355 | /* | ||
356 | * readdir reply flags. | ||
357 | */ | ||
358 | #define CEPH_READDIR_FRAG_END (1<<0) | ||
359 | #define CEPH_READDIR_FRAG_COMPLETE (1<<8) | ||
360 | #define CEPH_READDIR_HASH_ORDER (1<<9) | ||
361 | |||
347 | union ceph_mds_request_args { | 362 | union ceph_mds_request_args { |
348 | struct { | 363 | struct { |
349 | __le32 mask; /* CEPH_CAP_* */ | 364 | __le32 mask; /* CEPH_CAP_* */ |
@@ -361,6 +376,7 @@ union ceph_mds_request_args { | |||
361 | __le32 frag; /* which dir fragment */ | 376 | __le32 frag; /* which dir fragment */ |
362 | __le32 max_entries; /* how many dentries to grab */ | 377 | __le32 max_entries; /* how many dentries to grab */ |
363 | __le32 max_bytes; | 378 | __le32 max_bytes; |
379 | __le16 flags; | ||
364 | } __attribute__ ((packed)) readdir; | 380 | } __attribute__ ((packed)) readdir; |
365 | struct { | 381 | struct { |
366 | __le32 mode; | 382 | __le32 mode; |
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index a6ef9cc267ec..19e9932f3e77 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h | |||
@@ -47,7 +47,7 @@ static inline void ceph_decode_copy(void **p, void *pv, size_t n) | |||
47 | /* | 47 | /* |
48 | * bounds check input. | 48 | * bounds check input. |
49 | */ | 49 | */ |
50 | static inline int ceph_has_room(void **p, void *end, size_t n) | 50 | static inline bool ceph_has_room(void **p, void *end, size_t n) |
51 | { | 51 | { |
52 | return end >= *p && n <= end - *p; | 52 | return end >= *p && n <= end - *p; |
53 | } | 53 | } |
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index db92a8d4926e..690985daad1c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h | |||
@@ -180,6 +180,63 @@ static inline int calc_pages_for(u64 off, u64 len) | |||
180 | (off >> PAGE_SHIFT); | 180 | (off >> PAGE_SHIFT); |
181 | } | 181 | } |
182 | 182 | ||
183 | /* | ||
184 | * These are not meant to be generic - an integer key is assumed. | ||
185 | */ | ||
186 | #define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ | ||
187 | static void insert_##name(struct rb_root *root, type *t) \ | ||
188 | { \ | ||
189 | struct rb_node **n = &root->rb_node; \ | ||
190 | struct rb_node *parent = NULL; \ | ||
191 | \ | ||
192 | BUG_ON(!RB_EMPTY_NODE(&t->nodefld)); \ | ||
193 | \ | ||
194 | while (*n) { \ | ||
195 | type *cur = rb_entry(*n, type, nodefld); \ | ||
196 | \ | ||
197 | parent = *n; \ | ||
198 | if (t->keyfld < cur->keyfld) \ | ||
199 | n = &(*n)->rb_left; \ | ||
200 | else if (t->keyfld > cur->keyfld) \ | ||
201 | n = &(*n)->rb_right; \ | ||
202 | else \ | ||
203 | BUG(); \ | ||
204 | } \ | ||
205 | \ | ||
206 | rb_link_node(&t->nodefld, parent, n); \ | ||
207 | rb_insert_color(&t->nodefld, root); \ | ||
208 | } \ | ||
209 | static void erase_##name(struct rb_root *root, type *t) \ | ||
210 | { \ | ||
211 | BUG_ON(RB_EMPTY_NODE(&t->nodefld)); \ | ||
212 | rb_erase(&t->nodefld, root); \ | ||
213 | RB_CLEAR_NODE(&t->nodefld); \ | ||
214 | } | ||
215 | |||
216 | #define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ | ||
217 | static type *lookup_##name(struct rb_root *root, \ | ||
218 | typeof(((type *)0)->keyfld) key) \ | ||
219 | { \ | ||
220 | struct rb_node *n = root->rb_node; \ | ||
221 | \ | ||
222 | while (n) { \ | ||
223 | type *cur = rb_entry(n, type, nodefld); \ | ||
224 | \ | ||
225 | if (key < cur->keyfld) \ | ||
226 | n = n->rb_left; \ | ||
227 | else if (key > cur->keyfld) \ | ||
228 | n = n->rb_right; \ | ||
229 | else \ | ||
230 | return cur; \ | ||
231 | } \ | ||
232 | \ | ||
233 | return NULL; \ | ||
234 | } | ||
235 | |||
236 | #define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ | ||
237 | DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ | ||
238 | DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) | ||
239 | |||
183 | extern struct kmem_cache *ceph_inode_cachep; | 240 | extern struct kmem_cache *ceph_inode_cachep; |
184 | extern struct kmem_cache *ceph_cap_cachep; | 241 | extern struct kmem_cache *ceph_cap_cachep; |
185 | extern struct kmem_cache *ceph_cap_flush_cachep; | 242 | extern struct kmem_cache *ceph_cap_flush_cachep; |
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index e230e7ed60d3..e2a92df08b47 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h | |||
@@ -39,20 +39,31 @@ struct ceph_mon_request { | |||
39 | ceph_monc_request_func_t do_request; | 39 | ceph_monc_request_func_t do_request; |
40 | }; | 40 | }; |
41 | 41 | ||
42 | typedef void (*ceph_monc_callback_t)(struct ceph_mon_generic_request *); | ||
43 | |||
42 | /* | 44 | /* |
43 | * ceph_mon_generic_request is being used for the statfs and | 45 | * ceph_mon_generic_request is being used for the statfs and |
44 | * mon_get_version requests which are being done a bit differently | 46 | * mon_get_version requests which are being done a bit differently |
45 | * because we need to get data back to the caller | 47 | * because we need to get data back to the caller |
46 | */ | 48 | */ |
47 | struct ceph_mon_generic_request { | 49 | struct ceph_mon_generic_request { |
50 | struct ceph_mon_client *monc; | ||
48 | struct kref kref; | 51 | struct kref kref; |
49 | u64 tid; | 52 | u64 tid; |
50 | struct rb_node node; | 53 | struct rb_node node; |
51 | int result; | 54 | int result; |
52 | void *buf; | 55 | |
53 | struct completion completion; | 56 | struct completion completion; |
57 | ceph_monc_callback_t complete_cb; | ||
58 | u64 private_data; /* r_tid/linger_id */ | ||
59 | |||
54 | struct ceph_msg *request; /* original request */ | 60 | struct ceph_msg *request; /* original request */ |
55 | struct ceph_msg *reply; /* and reply */ | 61 | struct ceph_msg *reply; /* and reply */ |
62 | |||
63 | union { | ||
64 | struct ceph_statfs *st; | ||
65 | u64 newest; | ||
66 | } u; | ||
56 | }; | 67 | }; |
57 | 68 | ||
58 | struct ceph_mon_client { | 69 | struct ceph_mon_client { |
@@ -77,7 +88,6 @@ struct ceph_mon_client { | |||
77 | 88 | ||
78 | /* pending generic requests */ | 89 | /* pending generic requests */ |
79 | struct rb_root generic_request_tree; | 90 | struct rb_root generic_request_tree; |
80 | int num_generic_requests; | ||
81 | u64 last_tid; | 91 | u64 last_tid; |
82 | 92 | ||
83 | /* subs, indexed with CEPH_SUB_* */ | 93 | /* subs, indexed with CEPH_SUB_* */ |
@@ -86,6 +96,7 @@ struct ceph_mon_client { | |||
86 | bool want; | 96 | bool want; |
87 | u32 have; /* epoch */ | 97 | u32 have; /* epoch */ |
88 | } subs[3]; | 98 | } subs[3]; |
99 | int fs_cluster_id; /* "mdsmap.<id>" sub */ | ||
89 | 100 | ||
90 | #ifdef CONFIG_DEBUG_FS | 101 | #ifdef CONFIG_DEBUG_FS |
91 | struct dentry *debugfs_file; | 102 | struct dentry *debugfs_file; |
@@ -116,16 +127,18 @@ extern const char *ceph_sub_str[]; | |||
116 | bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, | 127 | bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, |
117 | bool continuous); | 128 | bool continuous); |
118 | void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); | 129 | void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); |
130 | void ceph_monc_renew_subs(struct ceph_mon_client *monc); | ||
119 | 131 | ||
120 | extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); | ||
121 | extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, | 132 | extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, |
122 | unsigned long timeout); | 133 | unsigned long timeout); |
123 | 134 | ||
124 | extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, | 135 | extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, |
125 | struct ceph_statfs *buf); | 136 | struct ceph_statfs *buf); |
126 | 137 | ||
127 | extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, | 138 | int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, |
128 | const char *what, u64 *newest); | 139 | u64 *newest); |
140 | int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, | ||
141 | ceph_monc_callback_t cb, u64 private_data); | ||
129 | 142 | ||
130 | extern int ceph_monc_open_session(struct ceph_mon_client *monc); | 143 | extern int ceph_monc_open_session(struct ceph_mon_client *monc); |
131 | 144 | ||
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cbf460927c42..19b14862d3e0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -20,10 +20,11 @@ struct ceph_osd_client; | |||
20 | /* | 20 | /* |
21 | * completion callback for async writepages | 21 | * completion callback for async writepages |
22 | */ | 22 | */ |
23 | typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, | 23 | typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); |
24 | struct ceph_msg *); | ||
25 | typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); | 24 | typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); |
26 | 25 | ||
26 | #define CEPH_HOMELESS_OSD -1 | ||
27 | |||
27 | /* a given osd we're communicating with */ | 28 | /* a given osd we're communicating with */ |
28 | struct ceph_osd { | 29 | struct ceph_osd { |
29 | atomic_t o_ref; | 30 | atomic_t o_ref; |
@@ -32,16 +33,15 @@ struct ceph_osd { | |||
32 | int o_incarnation; | 33 | int o_incarnation; |
33 | struct rb_node o_node; | 34 | struct rb_node o_node; |
34 | struct ceph_connection o_con; | 35 | struct ceph_connection o_con; |
35 | struct list_head o_requests; | 36 | struct rb_root o_requests; |
36 | struct list_head o_linger_requests; | 37 | struct rb_root o_linger_requests; |
37 | struct list_head o_osd_lru; | 38 | struct list_head o_osd_lru; |
38 | struct ceph_auth_handshake o_auth; | 39 | struct ceph_auth_handshake o_auth; |
39 | unsigned long lru_ttl; | 40 | unsigned long lru_ttl; |
40 | int o_marked_for_keepalive; | ||
41 | struct list_head o_keepalive_item; | 41 | struct list_head o_keepalive_item; |
42 | struct mutex lock; | ||
42 | }; | 43 | }; |
43 | 44 | ||
44 | |||
45 | #define CEPH_OSD_SLAB_OPS 2 | 45 | #define CEPH_OSD_SLAB_OPS 2 |
46 | #define CEPH_OSD_MAX_OPS 16 | 46 | #define CEPH_OSD_MAX_OPS 16 |
47 | 47 | ||
@@ -104,76 +104,95 @@ struct ceph_osd_req_op { | |||
104 | struct ceph_osd_data response_data; | 104 | struct ceph_osd_data response_data; |
105 | __u8 class_len; | 105 | __u8 class_len; |
106 | __u8 method_len; | 106 | __u8 method_len; |
107 | __u8 argc; | 107 | u32 indata_len; |
108 | } cls; | 108 | } cls; |
109 | struct { | 109 | struct { |
110 | u64 cookie; | 110 | u64 cookie; |
111 | u64 ver; | 111 | __u8 op; /* CEPH_OSD_WATCH_OP_ */ |
112 | u32 prot_ver; | 112 | u32 gen; |
113 | u32 timeout; | ||
114 | __u8 flag; | ||
115 | } watch; | 113 | } watch; |
116 | struct { | 114 | struct { |
115 | struct ceph_osd_data request_data; | ||
116 | } notify_ack; | ||
117 | struct { | ||
118 | u64 cookie; | ||
119 | struct ceph_osd_data request_data; | ||
120 | struct ceph_osd_data response_data; | ||
121 | } notify; | ||
122 | struct { | ||
117 | u64 expected_object_size; | 123 | u64 expected_object_size; |
118 | u64 expected_write_size; | 124 | u64 expected_write_size; |
119 | } alloc_hint; | 125 | } alloc_hint; |
120 | }; | 126 | }; |
121 | }; | 127 | }; |
122 | 128 | ||
129 | struct ceph_osd_request_target { | ||
130 | struct ceph_object_id base_oid; | ||
131 | struct ceph_object_locator base_oloc; | ||
132 | struct ceph_object_id target_oid; | ||
133 | struct ceph_object_locator target_oloc; | ||
134 | |||
135 | struct ceph_pg pgid; | ||
136 | u32 pg_num; | ||
137 | u32 pg_num_mask; | ||
138 | struct ceph_osds acting; | ||
139 | struct ceph_osds up; | ||
140 | int size; | ||
141 | int min_size; | ||
142 | bool sort_bitwise; | ||
143 | |||
144 | unsigned int flags; /* CEPH_OSD_FLAG_* */ | ||
145 | bool paused; | ||
146 | |||
147 | int osd; | ||
148 | }; | ||
149 | |||
123 | /* an in-flight request */ | 150 | /* an in-flight request */ |
124 | struct ceph_osd_request { | 151 | struct ceph_osd_request { |
125 | u64 r_tid; /* unique for this client */ | 152 | u64 r_tid; /* unique for this client */ |
126 | struct rb_node r_node; | 153 | struct rb_node r_node; |
127 | struct list_head r_req_lru_item; | 154 | struct rb_node r_mc_node; /* map check */ |
128 | struct list_head r_osd_item; | ||
129 | struct list_head r_linger_item; | ||
130 | struct list_head r_linger_osd_item; | ||
131 | struct ceph_osd *r_osd; | 155 | struct ceph_osd *r_osd; |
132 | struct ceph_pg r_pgid; | 156 | |
133 | int r_pg_osds[CEPH_PG_MAX_SIZE]; | 157 | struct ceph_osd_request_target r_t; |
134 | int r_num_pg_osds; | 158 | #define r_base_oid r_t.base_oid |
159 | #define r_base_oloc r_t.base_oloc | ||
160 | #define r_flags r_t.flags | ||
135 | 161 | ||
136 | struct ceph_msg *r_request, *r_reply; | 162 | struct ceph_msg *r_request, *r_reply; |
137 | int r_flags; /* any additional flags for the osd */ | ||
138 | u32 r_sent; /* >0 if r_request is sending/sent */ | 163 | u32 r_sent; /* >0 if r_request is sending/sent */ |
139 | 164 | ||
140 | /* request osd ops array */ | 165 | /* request osd ops array */ |
141 | unsigned int r_num_ops; | 166 | unsigned int r_num_ops; |
142 | 167 | ||
143 | /* these are updated on each send */ | ||
144 | __le32 *r_request_osdmap_epoch; | ||
145 | __le32 *r_request_flags; | ||
146 | __le64 *r_request_pool; | ||
147 | void *r_request_pgid; | ||
148 | __le32 *r_request_attempts; | ||
149 | bool r_paused; | ||
150 | struct ceph_eversion *r_request_reassert_version; | ||
151 | |||
152 | int r_result; | 168 | int r_result; |
153 | int r_got_reply; | 169 | bool r_got_reply; |
154 | int r_linger; | ||
155 | 170 | ||
156 | struct ceph_osd_client *r_osdc; | 171 | struct ceph_osd_client *r_osdc; |
157 | struct kref r_kref; | 172 | struct kref r_kref; |
158 | bool r_mempool; | 173 | bool r_mempool; |
159 | struct completion r_completion, r_safe_completion; | 174 | struct completion r_completion; |
175 | struct completion r_safe_completion; /* fsync waiter */ | ||
160 | ceph_osdc_callback_t r_callback; | 176 | ceph_osdc_callback_t r_callback; |
161 | ceph_osdc_unsafe_callback_t r_unsafe_callback; | 177 | ceph_osdc_unsafe_callback_t r_unsafe_callback; |
162 | struct ceph_eversion r_reassert_version; | ||
163 | struct list_head r_unsafe_item; | 178 | struct list_head r_unsafe_item; |
164 | 179 | ||
165 | struct inode *r_inode; /* for use by callbacks */ | 180 | struct inode *r_inode; /* for use by callbacks */ |
166 | void *r_priv; /* ditto */ | 181 | void *r_priv; /* ditto */ |
167 | 182 | ||
168 | struct ceph_object_locator r_base_oloc; | 183 | /* set by submitter */ |
169 | struct ceph_object_id r_base_oid; | 184 | u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ |
170 | struct ceph_object_locator r_target_oloc; | 185 | struct ceph_snap_context *r_snapc; /* for writes */ |
171 | struct ceph_object_id r_target_oid; | 186 | struct timespec r_mtime; /* ditto */ |
172 | 187 | u64 r_data_offset; /* ditto */ | |
173 | u64 r_snapid; | 188 | bool r_linger; /* don't resend on failure */ |
174 | unsigned long r_stamp; /* send OR check time */ | ||
175 | 189 | ||
176 | struct ceph_snap_context *r_snapc; /* snap context for writes */ | 190 | /* internal */ |
191 | unsigned long r_stamp; /* jiffies, send or check time */ | ||
192 | int r_attempts; | ||
193 | struct ceph_eversion r_replay_version; /* aka reassert_version */ | ||
194 | u32 r_last_force_resend; | ||
195 | u32 r_map_dne_bound; | ||
177 | 196 | ||
178 | struct ceph_osd_req_op r_ops[]; | 197 | struct ceph_osd_req_op r_ops[]; |
179 | }; | 198 | }; |
@@ -182,44 +201,70 @@ struct ceph_request_redirect { | |||
182 | struct ceph_object_locator oloc; | 201 | struct ceph_object_locator oloc; |
183 | }; | 202 | }; |
184 | 203 | ||
185 | struct ceph_osd_event { | 204 | typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, |
186 | u64 cookie; | 205 | u64 notifier_id, void *data, size_t data_len); |
187 | int one_shot; | 206 | typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); |
207 | |||
208 | struct ceph_osd_linger_request { | ||
188 | struct ceph_osd_client *osdc; | 209 | struct ceph_osd_client *osdc; |
189 | void (*cb)(u64, u64, u8, void *); | 210 | u64 linger_id; |
190 | void *data; | 211 | bool committed; |
191 | struct rb_node node; | 212 | bool is_watch; /* watch or notify */ |
192 | struct list_head osd_node; | 213 | |
214 | struct ceph_osd *osd; | ||
215 | struct ceph_osd_request *reg_req; | ||
216 | struct ceph_osd_request *ping_req; | ||
217 | unsigned long ping_sent; | ||
218 | unsigned long watch_valid_thru; | ||
219 | struct list_head pending_lworks; | ||
220 | |||
221 | struct ceph_osd_request_target t; | ||
222 | u32 last_force_resend; | ||
223 | u32 map_dne_bound; | ||
224 | |||
225 | struct timespec mtime; | ||
226 | |||
193 | struct kref kref; | 227 | struct kref kref; |
194 | }; | 228 | struct mutex lock; |
229 | struct rb_node node; /* osd */ | ||
230 | struct rb_node osdc_node; /* osdc */ | ||
231 | struct rb_node mc_node; /* map check */ | ||
232 | struct list_head scan_item; | ||
233 | |||
234 | struct completion reg_commit_wait; | ||
235 | struct completion notify_finish_wait; | ||
236 | int reg_commit_error; | ||
237 | int notify_finish_error; | ||
238 | int last_error; | ||
239 | |||
240 | u32 register_gen; | ||
241 | u64 notify_id; | ||
242 | |||
243 | rados_watchcb2_t wcb; | ||
244 | rados_watcherrcb_t errcb; | ||
245 | void *data; | ||
195 | 246 | ||
196 | struct ceph_osd_event_work { | 247 | struct page ***preply_pages; |
197 | struct work_struct work; | 248 | size_t *preply_len; |
198 | struct ceph_osd_event *event; | ||
199 | u64 ver; | ||
200 | u64 notify_id; | ||
201 | u8 opcode; | ||
202 | }; | 249 | }; |
203 | 250 | ||
204 | struct ceph_osd_client { | 251 | struct ceph_osd_client { |
205 | struct ceph_client *client; | 252 | struct ceph_client *client; |
206 | 253 | ||
207 | struct ceph_osdmap *osdmap; /* current map */ | 254 | struct ceph_osdmap *osdmap; /* current map */ |
208 | struct rw_semaphore map_sem; | 255 | struct rw_semaphore lock; |
209 | struct completion map_waiters; | ||
210 | u64 last_requested_map; | ||
211 | 256 | ||
212 | struct mutex request_mutex; | ||
213 | struct rb_root osds; /* osds */ | 257 | struct rb_root osds; /* osds */ |
214 | struct list_head osd_lru; /* idle osds */ | 258 | struct list_head osd_lru; /* idle osds */ |
215 | u64 timeout_tid; /* tid of timeout triggering rq */ | 259 | spinlock_t osd_lru_lock; |
216 | u64 last_tid; /* tid of last request */ | 260 | struct ceph_osd homeless_osd; |
217 | struct rb_root requests; /* pending requests */ | 261 | atomic64_t last_tid; /* tid of last request */ |
218 | struct list_head req_lru; /* in-flight lru */ | 262 | u64 last_linger_id; |
219 | struct list_head req_unsent; /* unsent/need-resend queue */ | 263 | struct rb_root linger_requests; /* lingering requests */ |
220 | struct list_head req_notarget; /* map to no osd */ | 264 | struct rb_root map_checks; |
221 | struct list_head req_linger; /* lingering requests */ | 265 | struct rb_root linger_map_checks; |
222 | int num_requests; | 266 | atomic_t num_requests; |
267 | atomic_t num_homeless; | ||
223 | struct delayed_work timeout_work; | 268 | struct delayed_work timeout_work; |
224 | struct delayed_work osds_timeout_work; | 269 | struct delayed_work osds_timeout_work; |
225 | #ifdef CONFIG_DEBUG_FS | 270 | #ifdef CONFIG_DEBUG_FS |
@@ -231,10 +276,6 @@ struct ceph_osd_client { | |||
231 | struct ceph_msgpool msgpool_op; | 276 | struct ceph_msgpool msgpool_op; |
232 | struct ceph_msgpool msgpool_op_reply; | 277 | struct ceph_msgpool msgpool_op_reply; |
233 | 278 | ||
234 | spinlock_t event_lock; | ||
235 | struct rb_root event_tree; | ||
236 | u64 event_count; | ||
237 | |||
238 | struct workqueue_struct *notify_wq; | 279 | struct workqueue_struct *notify_wq; |
239 | }; | 280 | }; |
240 | 281 | ||
@@ -271,9 +312,6 @@ extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, | |||
271 | extern struct ceph_osd_data *osd_req_op_extent_osd_data( | 312 | extern struct ceph_osd_data *osd_req_op_extent_osd_data( |
272 | struct ceph_osd_request *osd_req, | 313 | struct ceph_osd_request *osd_req, |
273 | unsigned int which); | 314 | unsigned int which); |
274 | extern struct ceph_osd_data *osd_req_op_cls_response_data( | ||
275 | struct ceph_osd_request *osd_req, | ||
276 | unsigned int which); | ||
277 | 315 | ||
278 | extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, | 316 | extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, |
279 | unsigned int which, | 317 | unsigned int which, |
@@ -309,9 +347,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, | |||
309 | extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | 347 | extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, |
310 | u16 opcode, const char *name, const void *value, | 348 | u16 opcode, const char *name, const void *value, |
311 | size_t size, u8 cmp_op, u8 cmp_mode); | 349 | size_t size, u8 cmp_op, u8 cmp_mode); |
312 | extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, | ||
313 | unsigned int which, u16 opcode, | ||
314 | u64 cookie, u64 version, int flag); | ||
315 | extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, | 350 | extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, |
316 | unsigned int which, | 351 | unsigned int which, |
317 | u64 expected_object_size, | 352 | u64 expected_object_size, |
@@ -322,11 +357,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * | |||
322 | unsigned int num_ops, | 357 | unsigned int num_ops, |
323 | bool use_mempool, | 358 | bool use_mempool, |
324 | gfp_t gfp_flags); | 359 | gfp_t gfp_flags); |
325 | 360 | int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp); | |
326 | extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, | ||
327 | struct ceph_snap_context *snapc, | ||
328 | u64 snap_id, | ||
329 | struct timespec *mtime); | ||
330 | 361 | ||
331 | extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | 362 | extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, |
332 | struct ceph_file_layout *layout, | 363 | struct ceph_file_layout *layout, |
@@ -338,9 +369,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | |||
338 | u32 truncate_seq, u64 truncate_size, | 369 | u32 truncate_seq, u64 truncate_size, |
339 | bool use_mempool); | 370 | bool use_mempool); |
340 | 371 | ||
341 | extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | ||
342 | struct ceph_osd_request *req); | ||
343 | |||
344 | extern void ceph_osdc_get_request(struct ceph_osd_request *req); | 372 | extern void ceph_osdc_get_request(struct ceph_osd_request *req); |
345 | extern void ceph_osdc_put_request(struct ceph_osd_request *req); | 373 | extern void ceph_osdc_put_request(struct ceph_osd_request *req); |
346 | 374 | ||
@@ -353,6 +381,7 @@ extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, | |||
353 | extern void ceph_osdc_sync(struct ceph_osd_client *osdc); | 381 | extern void ceph_osdc_sync(struct ceph_osd_client *osdc); |
354 | 382 | ||
355 | extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); | 383 | extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); |
384 | void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc); | ||
356 | 385 | ||
357 | extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, | 386 | extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, |
358 | struct ceph_vino vino, | 387 | struct ceph_vino vino, |
@@ -371,11 +400,33 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | |||
371 | struct timespec *mtime, | 400 | struct timespec *mtime, |
372 | struct page **pages, int nr_pages); | 401 | struct page **pages, int nr_pages); |
373 | 402 | ||
374 | /* watch/notify events */ | 403 | /* watch/notify */ |
375 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 404 | struct ceph_osd_linger_request * |
376 | void (*event_cb)(u64, u64, u8, void *), | 405 | ceph_osdc_watch(struct ceph_osd_client *osdc, |
377 | void *data, struct ceph_osd_event **pevent); | 406 | struct ceph_object_id *oid, |
378 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); | 407 | struct ceph_object_locator *oloc, |
379 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); | 408 | rados_watchcb2_t wcb, |
409 | rados_watcherrcb_t errcb, | ||
410 | void *data); | ||
411 | int ceph_osdc_unwatch(struct ceph_osd_client *osdc, | ||
412 | struct ceph_osd_linger_request *lreq); | ||
413 | |||
414 | int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, | ||
415 | struct ceph_object_id *oid, | ||
416 | struct ceph_object_locator *oloc, | ||
417 | u64 notify_id, | ||
418 | u64 cookie, | ||
419 | void *payload, | ||
420 | size_t payload_len); | ||
421 | int ceph_osdc_notify(struct ceph_osd_client *osdc, | ||
422 | struct ceph_object_id *oid, | ||
423 | struct ceph_object_locator *oloc, | ||
424 | void *payload, | ||
425 | size_t payload_len, | ||
426 | u32 timeout, | ||
427 | struct page ***preply_pages, | ||
428 | size_t *preply_len); | ||
429 | int ceph_osdc_watch_check(struct ceph_osd_client *osdc, | ||
430 | struct ceph_osd_linger_request *lreq); | ||
380 | #endif | 431 | #endif |
381 | 432 | ||
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index e55c08bc3a96..ddc426b22d81 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h | |||
@@ -24,21 +24,29 @@ struct ceph_pg { | |||
24 | uint32_t seed; | 24 | uint32_t seed; |
25 | }; | 25 | }; |
26 | 26 | ||
27 | #define CEPH_POOL_FLAG_HASHPSPOOL 1 | 27 | int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); |
28 | |||
29 | #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id | ||
30 | together */ | ||
31 | #define CEPH_POOL_FLAG_FULL (1ULL << 1) /* pool is full */ | ||
28 | 32 | ||
29 | struct ceph_pg_pool_info { | 33 | struct ceph_pg_pool_info { |
30 | struct rb_node node; | 34 | struct rb_node node; |
31 | s64 id; | 35 | s64 id; |
32 | u8 type; | 36 | u8 type; /* CEPH_POOL_TYPE_* */ |
33 | u8 size; | 37 | u8 size; |
38 | u8 min_size; | ||
34 | u8 crush_ruleset; | 39 | u8 crush_ruleset; |
35 | u8 object_hash; | 40 | u8 object_hash; |
41 | u32 last_force_request_resend; | ||
36 | u32 pg_num, pgp_num; | 42 | u32 pg_num, pgp_num; |
37 | int pg_num_mask, pgp_num_mask; | 43 | int pg_num_mask, pgp_num_mask; |
38 | s64 read_tier; | 44 | s64 read_tier; |
39 | s64 write_tier; /* wins for read+write ops */ | 45 | s64 write_tier; /* wins for read+write ops */ |
40 | u64 flags; | 46 | u64 flags; /* CEPH_POOL_FLAG_* */ |
41 | char *name; | 47 | char *name; |
48 | |||
49 | bool was_full; /* for handle_one_map() */ | ||
42 | }; | 50 | }; |
43 | 51 | ||
44 | static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) | 52 | static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) |
@@ -57,6 +65,22 @@ struct ceph_object_locator { | |||
57 | s64 pool; | 65 | s64 pool; |
58 | }; | 66 | }; |
59 | 67 | ||
68 | static inline void ceph_oloc_init(struct ceph_object_locator *oloc) | ||
69 | { | ||
70 | oloc->pool = -1; | ||
71 | } | ||
72 | |||
73 | static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc) | ||
74 | { | ||
75 | return oloc->pool == -1; | ||
76 | } | ||
77 | |||
78 | static inline void ceph_oloc_copy(struct ceph_object_locator *dest, | ||
79 | const struct ceph_object_locator *src) | ||
80 | { | ||
81 | dest->pool = src->pool; | ||
82 | } | ||
83 | |||
60 | /* | 84 | /* |
61 | * Maximum supported by kernel client object name length | 85 | * Maximum supported by kernel client object name length |
62 | * | 86 | * |
@@ -64,11 +88,47 @@ struct ceph_object_locator { | |||
64 | */ | 88 | */ |
65 | #define CEPH_MAX_OID_NAME_LEN 100 | 89 | #define CEPH_MAX_OID_NAME_LEN 100 |
66 | 90 | ||
91 | /* | ||
92 | * 51-char inline_name is long enough for all cephfs and all but one | ||
93 | * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be | ||
94 | * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all | ||
95 | * other rbd requests fit into inline_name. | ||
96 | * | ||
97 | * Makes ceph_object_id 64 bytes on 64-bit. | ||
98 | */ | ||
99 | #define CEPH_OID_INLINE_LEN 52 | ||
100 | |||
101 | /* | ||
102 | * Both inline and external buffers have space for a NUL-terminator, | ||
103 | * which is carried around. It's not required though - RADOS object | ||
104 | * names don't have to be NUL-terminated and may contain NULs. | ||
105 | */ | ||
67 | struct ceph_object_id { | 106 | struct ceph_object_id { |
68 | char name[CEPH_MAX_OID_NAME_LEN]; | 107 | char *name; |
108 | char inline_name[CEPH_OID_INLINE_LEN]; | ||
69 | int name_len; | 109 | int name_len; |
70 | }; | 110 | }; |
71 | 111 | ||
112 | static inline void ceph_oid_init(struct ceph_object_id *oid) | ||
113 | { | ||
114 | oid->name = oid->inline_name; | ||
115 | oid->name_len = 0; | ||
116 | } | ||
117 | |||
118 | static inline bool ceph_oid_empty(const struct ceph_object_id *oid) | ||
119 | { | ||
120 | return oid->name == oid->inline_name && !oid->name_len; | ||
121 | } | ||
122 | |||
123 | void ceph_oid_copy(struct ceph_object_id *dest, | ||
124 | const struct ceph_object_id *src); | ||
125 | __printf(2, 3) | ||
126 | void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...); | ||
127 | __printf(3, 4) | ||
128 | int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, | ||
129 | const char *fmt, ...); | ||
130 | void ceph_oid_destroy(struct ceph_object_id *oid); | ||
131 | |||
72 | struct ceph_pg_mapping { | 132 | struct ceph_pg_mapping { |
73 | struct rb_node node; | 133 | struct rb_node node; |
74 | struct ceph_pg pgid; | 134 | struct ceph_pg pgid; |
@@ -87,7 +147,6 @@ struct ceph_pg_mapping { | |||
87 | struct ceph_osdmap { | 147 | struct ceph_osdmap { |
88 | struct ceph_fsid fsid; | 148 | struct ceph_fsid fsid; |
89 | u32 epoch; | 149 | u32 epoch; |
90 | u32 mkfs_epoch; | ||
91 | struct ceph_timespec created, modified; | 150 | struct ceph_timespec created, modified; |
92 | 151 | ||
93 | u32 flags; /* CEPH_OSDMAP_* */ | 152 | u32 flags; /* CEPH_OSDMAP_* */ |
@@ -113,43 +172,19 @@ struct ceph_osdmap { | |||
113 | int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; | 172 | int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; |
114 | }; | 173 | }; |
115 | 174 | ||
116 | static inline void ceph_oid_set_name(struct ceph_object_id *oid, | 175 | static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) |
117 | const char *name) | ||
118 | { | ||
119 | int len; | ||
120 | |||
121 | len = strlen(name); | ||
122 | if (len > sizeof(oid->name)) { | ||
123 | WARN(1, "ceph_oid_set_name '%s' len %d vs %zu, truncating\n", | ||
124 | name, len, sizeof(oid->name)); | ||
125 | len = sizeof(oid->name); | ||
126 | } | ||
127 | |||
128 | memcpy(oid->name, name, len); | ||
129 | oid->name_len = len; | ||
130 | } | ||
131 | |||
132 | static inline void ceph_oid_copy(struct ceph_object_id *dest, | ||
133 | struct ceph_object_id *src) | ||
134 | { | ||
135 | BUG_ON(src->name_len > sizeof(dest->name)); | ||
136 | memcpy(dest->name, src->name, src->name_len); | ||
137 | dest->name_len = src->name_len; | ||
138 | } | ||
139 | |||
140 | static inline int ceph_osd_exists(struct ceph_osdmap *map, int osd) | ||
141 | { | 176 | { |
142 | return osd >= 0 && osd < map->max_osd && | 177 | return osd >= 0 && osd < map->max_osd && |
143 | (map->osd_state[osd] & CEPH_OSD_EXISTS); | 178 | (map->osd_state[osd] & CEPH_OSD_EXISTS); |
144 | } | 179 | } |
145 | 180 | ||
146 | static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd) | 181 | static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd) |
147 | { | 182 | { |
148 | return ceph_osd_exists(map, osd) && | 183 | return ceph_osd_exists(map, osd) && |
149 | (map->osd_state[osd] & CEPH_OSD_UP); | 184 | (map->osd_state[osd] & CEPH_OSD_UP); |
150 | } | 185 | } |
151 | 186 | ||
152 | static inline int ceph_osd_is_down(struct ceph_osdmap *map, int osd) | 187 | static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) |
153 | { | 188 | { |
154 | return !ceph_osd_is_up(map, osd); | 189 | return !ceph_osd_is_up(map, osd); |
155 | } | 190 | } |
@@ -192,28 +227,59 @@ static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) | |||
192 | return 0; | 227 | return 0; |
193 | } | 228 | } |
194 | 229 | ||
230 | struct ceph_osdmap *ceph_osdmap_alloc(void); | ||
195 | extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); | 231 | extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end); |
196 | extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | 232 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, |
197 | struct ceph_osdmap *map, | 233 | struct ceph_osdmap *map); |
198 | struct ceph_messenger *msgr); | ||
199 | extern void ceph_osdmap_destroy(struct ceph_osdmap *map); | 234 | extern void ceph_osdmap_destroy(struct ceph_osdmap *map); |
200 | 235 | ||
236 | struct ceph_osds { | ||
237 | int osds[CEPH_PG_MAX_SIZE]; | ||
238 | int size; | ||
239 | int primary; /* id, NOT index */ | ||
240 | }; | ||
241 | |||
242 | static inline void ceph_osds_init(struct ceph_osds *set) | ||
243 | { | ||
244 | set->size = 0; | ||
245 | set->primary = -1; | ||
246 | } | ||
247 | |||
248 | void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); | ||
249 | |||
250 | bool ceph_is_new_interval(const struct ceph_osds *old_acting, | ||
251 | const struct ceph_osds *new_acting, | ||
252 | const struct ceph_osds *old_up, | ||
253 | const struct ceph_osds *new_up, | ||
254 | int old_size, | ||
255 | int new_size, | ||
256 | int old_min_size, | ||
257 | int new_min_size, | ||
258 | u32 old_pg_num, | ||
259 | u32 new_pg_num, | ||
260 | bool old_sort_bitwise, | ||
261 | bool new_sort_bitwise, | ||
262 | const struct ceph_pg *pgid); | ||
263 | bool ceph_osds_changed(const struct ceph_osds *old_acting, | ||
264 | const struct ceph_osds *new_acting, | ||
265 | bool any_change); | ||
266 | |||
201 | /* calculate mapping of a file extent to an object */ | 267 | /* calculate mapping of a file extent to an object */ |
202 | extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, | 268 | extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, |
203 | u64 off, u64 len, | 269 | u64 off, u64 len, |
204 | u64 *bno, u64 *oxoff, u64 *oxlen); | 270 | u64 *bno, u64 *oxoff, u64 *oxlen); |
205 | 271 | ||
206 | /* calculate mapping of object to a placement group */ | 272 | int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, |
207 | extern int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, | 273 | struct ceph_object_id *oid, |
208 | struct ceph_object_locator *oloc, | 274 | struct ceph_object_locator *oloc, |
209 | struct ceph_object_id *oid, | 275 | struct ceph_pg *raw_pgid); |
210 | struct ceph_pg *pg_out); | 276 | |
211 | 277 | void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, | |
212 | extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, | 278 | const struct ceph_pg *raw_pgid, |
213 | struct ceph_pg pgid, | 279 | struct ceph_osds *up, |
214 | int *osds, int *primary); | 280 | struct ceph_osds *acting); |
215 | extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, | 281 | int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, |
216 | struct ceph_pg pgid); | 282 | const struct ceph_pg *raw_pgid); |
217 | 283 | ||
218 | extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, | 284 | extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, |
219 | u64 id); | 285 | u64 id); |
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 2f822dca1046..5c0da61cb763 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
@@ -114,8 +114,8 @@ struct ceph_object_layout { | |||
114 | * compound epoch+version, used by storage layer to serialize mutations | 114 | * compound epoch+version, used by storage layer to serialize mutations |
115 | */ | 115 | */ |
116 | struct ceph_eversion { | 116 | struct ceph_eversion { |
117 | __le32 epoch; | ||
118 | __le64 version; | 117 | __le64 version; |
118 | __le32 epoch; | ||
119 | } __attribute__ ((packed)); | 119 | } __attribute__ ((packed)); |
120 | 120 | ||
121 | /* | 121 | /* |
@@ -153,6 +153,11 @@ extern const char *ceph_osd_state_name(int s); | |||
153 | #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ | 153 | #define CEPH_OSDMAP_NOIN (1<<8) /* block osd auto mark-in */ |
154 | #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ | 154 | #define CEPH_OSDMAP_NOBACKFILL (1<<9) /* block osd backfill */ |
155 | #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ | 155 | #define CEPH_OSDMAP_NORECOVER (1<<10) /* block osd recovery and backfill */ |
156 | #define CEPH_OSDMAP_NOSCRUB (1<<11) /* block periodic scrub */ | ||
157 | #define CEPH_OSDMAP_NODEEP_SCRUB (1<<12) /* block periodic deep-scrub */ | ||
158 | #define CEPH_OSDMAP_NOTIERAGENT (1<<13) /* disable tiering agent */ | ||
159 | #define CEPH_OSDMAP_NOREBALANCE (1<<14) /* block osd backfill unless pg is degraded */ | ||
160 | #define CEPH_OSDMAP_SORTBITWISE (1<<15) /* use bitwise hobject_t sort */ | ||
156 | 161 | ||
157 | /* | 162 | /* |
158 | * The error code to return when an OSD can't handle a write | 163 | * The error code to return when an OSD can't handle a write |
@@ -389,6 +394,13 @@ enum { | |||
389 | CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ | 394 | CEPH_OSD_FLAG_SKIPRWLOCKS = 0x10000, /* skip rw locks */ |
390 | CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ | 395 | CEPH_OSD_FLAG_IGNORE_OVERLAY = 0x20000, /* ignore pool overlay */ |
391 | CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ | 396 | CEPH_OSD_FLAG_FLUSH = 0x40000, /* this is part of flush */ |
397 | CEPH_OSD_FLAG_MAP_SNAP_CLONE = 0x80000, /* map snap direct to clone id */ | ||
398 | CEPH_OSD_FLAG_ENFORCE_SNAPC = 0x100000, /* use snapc provided even if | ||
399 | pool uses pool snaps */ | ||
400 | CEPH_OSD_FLAG_REDIRECTED = 0x200000, /* op has been redirected */ | ||
401 | CEPH_OSD_FLAG_KNOWN_REDIR = 0x400000, /* redirect bit is authoritative */ | ||
402 | CEPH_OSD_FLAG_FULL_TRY = 0x800000, /* try op despite full flag */ | ||
403 | CEPH_OSD_FLAG_FULL_FORCE = 0x1000000, /* force op despite full flag */ | ||
392 | }; | 404 | }; |
393 | 405 | ||
394 | enum { | 406 | enum { |
@@ -415,7 +427,17 @@ enum { | |||
415 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 | 427 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 |
416 | }; | 428 | }; |
417 | 429 | ||
418 | #define RADOS_NOTIFY_VER 1 | 430 | enum { |
431 | CEPH_OSD_WATCH_OP_UNWATCH = 0, | ||
432 | CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, | ||
433 | /* note: use only ODD ids to prevent pre-giant code from | ||
434 | interpreting the op as UNWATCH */ | ||
435 | CEPH_OSD_WATCH_OP_WATCH = 3, | ||
436 | CEPH_OSD_WATCH_OP_RECONNECT = 5, | ||
437 | CEPH_OSD_WATCH_OP_PING = 7, | ||
438 | }; | ||
439 | |||
440 | const char *ceph_osd_watch_op_name(int o); | ||
419 | 441 | ||
420 | /* | 442 | /* |
421 | * an individual object operation. each may be accompanied by some data | 443 | * an individual object operation. each may be accompanied by some data |
@@ -450,10 +472,14 @@ struct ceph_osd_op { | |||
450 | } __attribute__ ((packed)) snap; | 472 | } __attribute__ ((packed)) snap; |
451 | struct { | 473 | struct { |
452 | __le64 cookie; | 474 | __le64 cookie; |
453 | __le64 ver; | 475 | __le64 ver; /* no longer used */ |
454 | __u8 flag; /* 0 = unwatch, 1 = watch */ | 476 | __u8 op; /* CEPH_OSD_WATCH_OP_* */ |
477 | __le32 gen; /* registration generation */ | ||
455 | } __attribute__ ((packed)) watch; | 478 | } __attribute__ ((packed)) watch; |
456 | struct { | 479 | struct { |
480 | __le64 cookie; | ||
481 | } __attribute__ ((packed)) notify; | ||
482 | struct { | ||
457 | __le64 offset, length; | 483 | __le64 offset, length; |
458 | __le64 src_offset; | 484 | __le64 src_offset; |
459 | } __attribute__ ((packed)) clonerange; | 485 | } __attribute__ ((packed)) clonerange; |
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index dcc18c6f7cf9..55d2bfee16d7 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c | |||
@@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client); | |||
651 | /* | 651 | /* |
652 | * true if we have the mon map (and have thus joined the cluster) | 652 | * true if we have the mon map (and have thus joined the cluster) |
653 | */ | 653 | */ |
654 | static int have_mon_and_osd_map(struct ceph_client *client) | 654 | static bool have_mon_and_osd_map(struct ceph_client *client) |
655 | { | 655 | { |
656 | return client->monc.monmap && client->monc.monmap->epoch && | 656 | return client->monc.monmap && client->monc.monmap->epoch && |
657 | client->osdc.osdmap && client->osdc.osdmap->epoch; | 657 | client->osdc.osdmap && client->osdc.osdmap->epoch; |
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 139a9cb19b0c..3773a4fa11e3 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c | |||
@@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) | |||
27 | } | 27 | } |
28 | } | 28 | } |
29 | 29 | ||
30 | const char *ceph_osd_watch_op_name(int o) | ||
31 | { | ||
32 | switch (o) { | ||
33 | case CEPH_OSD_WATCH_OP_UNWATCH: | ||
34 | return "unwatch"; | ||
35 | case CEPH_OSD_WATCH_OP_WATCH: | ||
36 | return "watch"; | ||
37 | case CEPH_OSD_WATCH_OP_RECONNECT: | ||
38 | return "reconnect"; | ||
39 | case CEPH_OSD_WATCH_OP_PING: | ||
40 | return "ping"; | ||
41 | default: | ||
42 | return "???"; | ||
43 | } | ||
44 | } | ||
45 | |||
30 | const char *ceph_osd_state_name(int s) | 46 | const char *ceph_osd_state_name(int s) |
31 | { | 47 | { |
32 | switch (s) { | 48 | switch (s) { |
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index b902fbc7863e..e77b04ca7802 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c | |||
@@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p) | |||
54 | { | 54 | { |
55 | int i; | 55 | int i; |
56 | struct ceph_client *client = s->private; | 56 | struct ceph_client *client = s->private; |
57 | struct ceph_osdmap *map = client->osdc.osdmap; | 57 | struct ceph_osd_client *osdc = &client->osdc; |
58 | struct ceph_osdmap *map = osdc->osdmap; | ||
58 | struct rb_node *n; | 59 | struct rb_node *n; |
59 | 60 | ||
60 | if (map == NULL) | 61 | if (map == NULL) |
61 | return 0; | 62 | return 0; |
62 | 63 | ||
63 | seq_printf(s, "epoch %d\n", map->epoch); | 64 | down_read(&osdc->lock); |
64 | seq_printf(s, "flags%s%s\n", | 65 | seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); |
65 | (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", | ||
66 | (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); | ||
67 | 66 | ||
68 | for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { | 67 | for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { |
69 | struct ceph_pg_pool_info *pool = | 68 | struct ceph_pg_pool_info *pi = |
70 | rb_entry(n, struct ceph_pg_pool_info, node); | 69 | rb_entry(n, struct ceph_pg_pool_info, node); |
71 | 70 | ||
72 | seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", | 71 | seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n", |
73 | pool->id, pool->pg_num, pool->pg_num_mask, | 72 | pi->id, pi->name, pi->type, pi->size, pi->min_size, |
74 | pool->read_tier, pool->write_tier); | 73 | pi->pg_num, pi->pg_num_mask, pi->flags, |
74 | pi->last_force_request_resend, pi->read_tier, | ||
75 | pi->write_tier); | ||
75 | } | 76 | } |
76 | for (i = 0; i < map->max_osd; i++) { | 77 | for (i = 0; i < map->max_osd; i++) { |
77 | struct ceph_entity_addr *addr = &map->osd_addr[i]; | 78 | struct ceph_entity_addr *addr = &map->osd_addr[i]; |
@@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p) | |||
103 | pg->pgid.seed, pg->primary_temp.osd); | 104 | pg->pgid.seed, pg->primary_temp.osd); |
104 | } | 105 | } |
105 | 106 | ||
107 | up_read(&osdc->lock); | ||
106 | return 0; | 108 | return 0; |
107 | } | 109 | } |
108 | 110 | ||
@@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p) | |||
126 | CEPH_SUBSCRIBE_ONETIME ? "" : "+")); | 128 | CEPH_SUBSCRIBE_ONETIME ? "" : "+")); |
127 | seq_putc(s, '\n'); | 129 | seq_putc(s, '\n'); |
128 | } | 130 | } |
131 | seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id); | ||
129 | 132 | ||
130 | for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { | 133 | for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { |
131 | __u16 op; | 134 | __u16 op; |
@@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p) | |||
143 | return 0; | 146 | return 0; |
144 | } | 147 | } |
145 | 148 | ||
146 | static int osdc_show(struct seq_file *s, void *pp) | 149 | static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) |
147 | { | 150 | { |
148 | struct ceph_client *client = s->private; | 151 | int i; |
149 | struct ceph_osd_client *osdc = &client->osdc; | ||
150 | struct rb_node *p; | ||
151 | 152 | ||
152 | mutex_lock(&osdc->request_mutex); | 153 | seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); |
153 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | 154 | for (i = 0; i < t->up.size; i++) |
154 | struct ceph_osd_request *req; | 155 | seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); |
155 | unsigned int i; | 156 | seq_printf(s, "]/%d\t[", t->up.primary); |
156 | int opcode; | 157 | for (i = 0; i < t->acting.size; i++) |
158 | seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); | ||
159 | seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, | ||
160 | t->target_oid.name_len, t->target_oid.name, t->flags); | ||
161 | if (t->paused) | ||
162 | seq_puts(s, "\tP"); | ||
163 | } | ||
157 | 164 | ||
158 | req = rb_entry(p, struct ceph_osd_request, r_node); | 165 | static void dump_request(struct seq_file *s, struct ceph_osd_request *req) |
166 | { | ||
167 | int i; | ||
159 | 168 | ||
160 | seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, | 169 | seq_printf(s, "%llu\t", req->r_tid); |
161 | req->r_osd ? req->r_osd->o_osd : -1, | 170 | dump_target(s, &req->r_t); |
162 | req->r_pgid.pool, req->r_pgid.seed); | ||
163 | 171 | ||
164 | seq_printf(s, "%.*s", req->r_base_oid.name_len, | 172 | seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, |
165 | req->r_base_oid.name); | 173 | le32_to_cpu(req->r_replay_version.epoch), |
174 | le64_to_cpu(req->r_replay_version.version)); | ||
166 | 175 | ||
167 | if (req->r_reassert_version.epoch) | 176 | for (i = 0; i < req->r_num_ops; i++) { |
168 | seq_printf(s, "\t%u'%llu", | 177 | struct ceph_osd_req_op *op = &req->r_ops[i]; |
169 | (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), | 178 | |
170 | le64_to_cpu(req->r_reassert_version.version)); | 179 | seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), |
171 | else | 180 | ceph_osd_op_name(op->op)); |
172 | seq_printf(s, "\t"); | 181 | if (op->op == CEPH_OSD_OP_WATCH) |
182 | seq_printf(s, "-%s", | ||
183 | ceph_osd_watch_op_name(op->watch.op)); | ||
184 | } | ||
185 | |||
186 | seq_putc(s, '\n'); | ||
187 | } | ||
188 | |||
189 | static void dump_requests(struct seq_file *s, struct ceph_osd *osd) | ||
190 | { | ||
191 | struct rb_node *n; | ||
192 | |||
193 | mutex_lock(&osd->lock); | ||
194 | for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { | ||
195 | struct ceph_osd_request *req = | ||
196 | rb_entry(n, struct ceph_osd_request, r_node); | ||
197 | |||
198 | dump_request(s, req); | ||
199 | } | ||
200 | |||
201 | mutex_unlock(&osd->lock); | ||
202 | } | ||
173 | 203 | ||
174 | for (i = 0; i < req->r_num_ops; i++) { | 204 | static void dump_linger_request(struct seq_file *s, |
175 | opcode = req->r_ops[i].op; | 205 | struct ceph_osd_linger_request *lreq) |
176 | seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), | 206 | { |
177 | ceph_osd_op_name(opcode)); | 207 | seq_printf(s, "%llu\t", lreq->linger_id); |
178 | } | 208 | dump_target(s, &lreq->t); |
209 | |||
210 | seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen, | ||
211 | lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "", | ||
212 | lreq->last_error); | ||
213 | } | ||
214 | |||
215 | static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) | ||
216 | { | ||
217 | struct rb_node *n; | ||
218 | |||
219 | mutex_lock(&osd->lock); | ||
220 | for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { | ||
221 | struct ceph_osd_linger_request *lreq = | ||
222 | rb_entry(n, struct ceph_osd_linger_request, node); | ||
223 | |||
224 | dump_linger_request(s, lreq); | ||
225 | } | ||
226 | |||
227 | mutex_unlock(&osd->lock); | ||
228 | } | ||
179 | 229 | ||
180 | seq_printf(s, "\n"); | 230 | static int osdc_show(struct seq_file *s, void *pp) |
231 | { | ||
232 | struct ceph_client *client = s->private; | ||
233 | struct ceph_osd_client *osdc = &client->osdc; | ||
234 | struct rb_node *n; | ||
235 | |||
236 | down_read(&osdc->lock); | ||
237 | seq_printf(s, "REQUESTS %d homeless %d\n", | ||
238 | atomic_read(&osdc->num_requests), | ||
239 | atomic_read(&osdc->num_homeless)); | ||
240 | for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { | ||
241 | struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); | ||
242 | |||
243 | dump_requests(s, osd); | ||
181 | } | 244 | } |
182 | mutex_unlock(&osdc->request_mutex); | 245 | dump_requests(s, &osdc->homeless_osd); |
246 | |||
247 | seq_puts(s, "LINGER REQUESTS\n"); | ||
248 | for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { | ||
249 | struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); | ||
250 | |||
251 | dump_linger_requests(s, osd); | ||
252 | } | ||
253 | dump_linger_requests(s, &osdc->homeless_osd); | ||
254 | |||
255 | up_read(&osdc->lock); | ||
183 | return 0; | 256 | return 0; |
184 | } | 257 | } |
185 | 258 | ||
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index cf638c009cfa..37c38a7fb5c5 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c | |||
@@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc) | |||
260 | BUG_ON(num < 1); /* monmap sub is always there */ | 260 | BUG_ON(num < 1); /* monmap sub is always there */ |
261 | ceph_encode_32(&p, num); | 261 | ceph_encode_32(&p, num); |
262 | for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { | 262 | for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { |
263 | const char *s = ceph_sub_str[i]; | 263 | char buf[32]; |
264 | int len; | ||
264 | 265 | ||
265 | if (!monc->subs[i].want) | 266 | if (!monc->subs[i].want) |
266 | continue; | 267 | continue; |
267 | 268 | ||
268 | dout("%s %s start %llu flags 0x%x\n", __func__, s, | 269 | len = sprintf(buf, "%s", ceph_sub_str[i]); |
270 | if (i == CEPH_SUB_MDSMAP && | ||
271 | monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE) | ||
272 | len += sprintf(buf + len, ".%d", monc->fs_cluster_id); | ||
273 | |||
274 | dout("%s %s start %llu flags 0x%x\n", __func__, buf, | ||
269 | le64_to_cpu(monc->subs[i].item.start), | 275 | le64_to_cpu(monc->subs[i].item.start), |
270 | monc->subs[i].item.flags); | 276 | monc->subs[i].item.flags); |
271 | ceph_encode_string(&p, end, s, strlen(s)); | 277 | ceph_encode_string(&p, end, buf, len); |
272 | memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); | 278 | memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); |
273 | p += sizeof(monc->subs[i].item); | 279 | p += sizeof(monc->subs[i].item); |
274 | } | 280 | } |
275 | 281 | ||
276 | BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); | 282 | BUG_ON(p > end); |
277 | msg->front.iov_len = p - msg->front.iov_base; | 283 | msg->front.iov_len = p - msg->front.iov_base; |
278 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | 284 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); |
279 | ceph_msg_revoke(msg); | 285 | ceph_msg_revoke(msg); |
@@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) | |||
376 | } | 382 | } |
377 | EXPORT_SYMBOL(ceph_monc_got_map); | 383 | EXPORT_SYMBOL(ceph_monc_got_map); |
378 | 384 | ||
379 | /* | 385 | void ceph_monc_renew_subs(struct ceph_mon_client *monc) |
380 | * Register interest in the next osdmap | ||
381 | */ | ||
382 | void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) | ||
383 | { | 386 | { |
384 | dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); | ||
385 | mutex_lock(&monc->mutex); | 387 | mutex_lock(&monc->mutex); |
386 | if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, | 388 | __send_subscribe(monc); |
387 | monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) | ||
388 | __send_subscribe(monc); | ||
389 | mutex_unlock(&monc->mutex); | 389 | mutex_unlock(&monc->mutex); |
390 | } | 390 | } |
391 | EXPORT_SYMBOL(ceph_monc_request_next_osdmap); | 391 | EXPORT_SYMBOL(ceph_monc_renew_subs); |
392 | 392 | ||
393 | /* | 393 | /* |
394 | * Wait for an osdmap with a given epoch. | 394 | * Wait for an osdmap with a given epoch. |
@@ -478,51 +478,17 @@ out: | |||
478 | /* | 478 | /* |
479 | * generic requests (currently statfs, mon_get_version) | 479 | * generic requests (currently statfs, mon_get_version) |
480 | */ | 480 | */ |
481 | static struct ceph_mon_generic_request *__lookup_generic_req( | 481 | DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node) |
482 | struct ceph_mon_client *monc, u64 tid) | ||
483 | { | ||
484 | struct ceph_mon_generic_request *req; | ||
485 | struct rb_node *n = monc->generic_request_tree.rb_node; | ||
486 | |||
487 | while (n) { | ||
488 | req = rb_entry(n, struct ceph_mon_generic_request, node); | ||
489 | if (tid < req->tid) | ||
490 | n = n->rb_left; | ||
491 | else if (tid > req->tid) | ||
492 | n = n->rb_right; | ||
493 | else | ||
494 | return req; | ||
495 | } | ||
496 | return NULL; | ||
497 | } | ||
498 | |||
499 | static void __insert_generic_request(struct ceph_mon_client *monc, | ||
500 | struct ceph_mon_generic_request *new) | ||
501 | { | ||
502 | struct rb_node **p = &monc->generic_request_tree.rb_node; | ||
503 | struct rb_node *parent = NULL; | ||
504 | struct ceph_mon_generic_request *req = NULL; | ||
505 | |||
506 | while (*p) { | ||
507 | parent = *p; | ||
508 | req = rb_entry(parent, struct ceph_mon_generic_request, node); | ||
509 | if (new->tid < req->tid) | ||
510 | p = &(*p)->rb_left; | ||
511 | else if (new->tid > req->tid) | ||
512 | p = &(*p)->rb_right; | ||
513 | else | ||
514 | BUG(); | ||
515 | } | ||
516 | |||
517 | rb_link_node(&new->node, parent, p); | ||
518 | rb_insert_color(&new->node, &monc->generic_request_tree); | ||
519 | } | ||
520 | 482 | ||
521 | static void release_generic_request(struct kref *kref) | 483 | static void release_generic_request(struct kref *kref) |
522 | { | 484 | { |
523 | struct ceph_mon_generic_request *req = | 485 | struct ceph_mon_generic_request *req = |
524 | container_of(kref, struct ceph_mon_generic_request, kref); | 486 | container_of(kref, struct ceph_mon_generic_request, kref); |
525 | 487 | ||
488 | dout("%s greq %p request %p reply %p\n", __func__, req, req->request, | ||
489 | req->reply); | ||
490 | WARN_ON(!RB_EMPTY_NODE(&req->node)); | ||
491 | |||
526 | if (req->reply) | 492 | if (req->reply) |
527 | ceph_msg_put(req->reply); | 493 | ceph_msg_put(req->reply); |
528 | if (req->request) | 494 | if (req->request) |
@@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref) | |||
533 | 499 | ||
534 | static void put_generic_request(struct ceph_mon_generic_request *req) | 500 | static void put_generic_request(struct ceph_mon_generic_request *req) |
535 | { | 501 | { |
536 | kref_put(&req->kref, release_generic_request); | 502 | if (req) |
503 | kref_put(&req->kref, release_generic_request); | ||
537 | } | 504 | } |
538 | 505 | ||
539 | static void get_generic_request(struct ceph_mon_generic_request *req) | 506 | static void get_generic_request(struct ceph_mon_generic_request *req) |
@@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req) | |||
541 | kref_get(&req->kref); | 508 | kref_get(&req->kref); |
542 | } | 509 | } |
543 | 510 | ||
511 | static struct ceph_mon_generic_request * | ||
512 | alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp) | ||
513 | { | ||
514 | struct ceph_mon_generic_request *req; | ||
515 | |||
516 | req = kzalloc(sizeof(*req), gfp); | ||
517 | if (!req) | ||
518 | return NULL; | ||
519 | |||
520 | req->monc = monc; | ||
521 | kref_init(&req->kref); | ||
522 | RB_CLEAR_NODE(&req->node); | ||
523 | init_completion(&req->completion); | ||
524 | |||
525 | dout("%s greq %p\n", __func__, req); | ||
526 | return req; | ||
527 | } | ||
528 | |||
529 | static void register_generic_request(struct ceph_mon_generic_request *req) | ||
530 | { | ||
531 | struct ceph_mon_client *monc = req->monc; | ||
532 | |||
533 | WARN_ON(req->tid); | ||
534 | |||
535 | get_generic_request(req); | ||
536 | req->tid = ++monc->last_tid; | ||
537 | insert_generic_request(&monc->generic_request_tree, req); | ||
538 | } | ||
539 | |||
540 | static void send_generic_request(struct ceph_mon_client *monc, | ||
541 | struct ceph_mon_generic_request *req) | ||
542 | { | ||
543 | WARN_ON(!req->tid); | ||
544 | |||
545 | dout("%s greq %p tid %llu\n", __func__, req, req->tid); | ||
546 | req->request->hdr.tid = cpu_to_le64(req->tid); | ||
547 | ceph_con_send(&monc->con, ceph_msg_get(req->request)); | ||
548 | } | ||
549 | |||
550 | static void __finish_generic_request(struct ceph_mon_generic_request *req) | ||
551 | { | ||
552 | struct ceph_mon_client *monc = req->monc; | ||
553 | |||
554 | dout("%s greq %p tid %llu\n", __func__, req, req->tid); | ||
555 | erase_generic_request(&monc->generic_request_tree, req); | ||
556 | |||
557 | ceph_msg_revoke(req->request); | ||
558 | ceph_msg_revoke_incoming(req->reply); | ||
559 | } | ||
560 | |||
561 | static void finish_generic_request(struct ceph_mon_generic_request *req) | ||
562 | { | ||
563 | __finish_generic_request(req); | ||
564 | put_generic_request(req); | ||
565 | } | ||
566 | |||
567 | static void complete_generic_request(struct ceph_mon_generic_request *req) | ||
568 | { | ||
569 | if (req->complete_cb) | ||
570 | req->complete_cb(req); | ||
571 | else | ||
572 | complete_all(&req->completion); | ||
573 | put_generic_request(req); | ||
574 | } | ||
575 | |||
576 | void cancel_generic_request(struct ceph_mon_generic_request *req) | ||
577 | { | ||
578 | struct ceph_mon_client *monc = req->monc; | ||
579 | struct ceph_mon_generic_request *lookup_req; | ||
580 | |||
581 | dout("%s greq %p tid %llu\n", __func__, req, req->tid); | ||
582 | |||
583 | mutex_lock(&monc->mutex); | ||
584 | lookup_req = lookup_generic_request(&monc->generic_request_tree, | ||
585 | req->tid); | ||
586 | if (lookup_req) { | ||
587 | WARN_ON(lookup_req != req); | ||
588 | finish_generic_request(req); | ||
589 | } | ||
590 | |||
591 | mutex_unlock(&monc->mutex); | ||
592 | } | ||
593 | |||
594 | static int wait_generic_request(struct ceph_mon_generic_request *req) | ||
595 | { | ||
596 | int ret; | ||
597 | |||
598 | dout("%s greq %p tid %llu\n", __func__, req, req->tid); | ||
599 | ret = wait_for_completion_interruptible(&req->completion); | ||
600 | if (ret) | ||
601 | cancel_generic_request(req); | ||
602 | else | ||
603 | ret = req->result; /* completed */ | ||
604 | |||
605 | return ret; | ||
606 | } | ||
607 | |||
544 | static struct ceph_msg *get_generic_reply(struct ceph_connection *con, | 608 | static struct ceph_msg *get_generic_reply(struct ceph_connection *con, |
545 | struct ceph_msg_header *hdr, | 609 | struct ceph_msg_header *hdr, |
546 | int *skip) | 610 | int *skip) |
@@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, | |||
551 | struct ceph_msg *m; | 615 | struct ceph_msg *m; |
552 | 616 | ||
553 | mutex_lock(&monc->mutex); | 617 | mutex_lock(&monc->mutex); |
554 | req = __lookup_generic_req(monc, tid); | 618 | req = lookup_generic_request(&monc->generic_request_tree, tid); |
555 | if (!req) { | 619 | if (!req) { |
556 | dout("get_generic_reply %lld dne\n", tid); | 620 | dout("get_generic_reply %lld dne\n", tid); |
557 | *skip = 1; | 621 | *skip = 1; |
@@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, | |||
570 | return m; | 634 | return m; |
571 | } | 635 | } |
572 | 636 | ||
573 | static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, | ||
574 | struct ceph_mon_generic_request *req) | ||
575 | { | ||
576 | int err; | ||
577 | |||
578 | /* register request */ | ||
579 | req->tid = tid != 0 ? tid : ++monc->last_tid; | ||
580 | req->request->hdr.tid = cpu_to_le64(req->tid); | ||
581 | __insert_generic_request(monc, req); | ||
582 | monc->num_generic_requests++; | ||
583 | ceph_con_send(&monc->con, ceph_msg_get(req->request)); | ||
584 | mutex_unlock(&monc->mutex); | ||
585 | |||
586 | err = wait_for_completion_interruptible(&req->completion); | ||
587 | |||
588 | mutex_lock(&monc->mutex); | ||
589 | rb_erase(&req->node, &monc->generic_request_tree); | ||
590 | monc->num_generic_requests--; | ||
591 | |||
592 | if (!err) | ||
593 | err = req->result; | ||
594 | return err; | ||
595 | } | ||
596 | |||
597 | static int do_generic_request(struct ceph_mon_client *monc, | ||
598 | struct ceph_mon_generic_request *req) | ||
599 | { | ||
600 | int err; | ||
601 | |||
602 | mutex_lock(&monc->mutex); | ||
603 | err = __do_generic_request(monc, 0, req); | ||
604 | mutex_unlock(&monc->mutex); | ||
605 | |||
606 | return err; | ||
607 | } | ||
608 | |||
609 | /* | 637 | /* |
610 | * statfs | 638 | * statfs |
611 | */ | 639 | */ |
@@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, | |||
616 | struct ceph_mon_statfs_reply *reply = msg->front.iov_base; | 644 | struct ceph_mon_statfs_reply *reply = msg->front.iov_base; |
617 | u64 tid = le64_to_cpu(msg->hdr.tid); | 645 | u64 tid = le64_to_cpu(msg->hdr.tid); |
618 | 646 | ||
647 | dout("%s msg %p tid %llu\n", __func__, msg, tid); | ||
648 | |||
619 | if (msg->front.iov_len != sizeof(*reply)) | 649 | if (msg->front.iov_len != sizeof(*reply)) |
620 | goto bad; | 650 | goto bad; |
621 | dout("handle_statfs_reply %p tid %llu\n", msg, tid); | ||
622 | 651 | ||
623 | mutex_lock(&monc->mutex); | 652 | mutex_lock(&monc->mutex); |
624 | req = __lookup_generic_req(monc, tid); | 653 | req = lookup_generic_request(&monc->generic_request_tree, tid); |
625 | if (req) { | 654 | if (!req) { |
626 | *(struct ceph_statfs *)req->buf = reply->st; | 655 | mutex_unlock(&monc->mutex); |
627 | req->result = 0; | 656 | return; |
628 | get_generic_request(req); | ||
629 | } | 657 | } |
658 | |||
659 | req->result = 0; | ||
660 | *req->u.st = reply->st; /* struct */ | ||
661 | __finish_generic_request(req); | ||
630 | mutex_unlock(&monc->mutex); | 662 | mutex_unlock(&monc->mutex); |
631 | if (req) { | 663 | |
632 | complete_all(&req->completion); | 664 | complete_generic_request(req); |
633 | put_generic_request(req); | ||
634 | } | ||
635 | return; | 665 | return; |
636 | 666 | ||
637 | bad: | 667 | bad: |
@@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) | |||
646 | { | 676 | { |
647 | struct ceph_mon_generic_request *req; | 677 | struct ceph_mon_generic_request *req; |
648 | struct ceph_mon_statfs *h; | 678 | struct ceph_mon_statfs *h; |
649 | int err; | 679 | int ret = -ENOMEM; |
650 | 680 | ||
651 | req = kzalloc(sizeof(*req), GFP_NOFS); | 681 | req = alloc_generic_request(monc, GFP_NOFS); |
652 | if (!req) | 682 | if (!req) |
653 | return -ENOMEM; | 683 | goto out; |
654 | |||
655 | kref_init(&req->kref); | ||
656 | req->buf = buf; | ||
657 | init_completion(&req->completion); | ||
658 | 684 | ||
659 | err = -ENOMEM; | ||
660 | req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, | 685 | req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, |
661 | true); | 686 | true); |
662 | if (!req->request) | 687 | if (!req->request) |
663 | goto out; | 688 | goto out; |
664 | req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, | 689 | |
665 | true); | 690 | req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true); |
666 | if (!req->reply) | 691 | if (!req->reply) |
667 | goto out; | 692 | goto out; |
668 | 693 | ||
694 | req->u.st = buf; | ||
695 | |||
696 | mutex_lock(&monc->mutex); | ||
697 | register_generic_request(req); | ||
669 | /* fill out request */ | 698 | /* fill out request */ |
670 | h = req->request->front.iov_base; | 699 | h = req->request->front.iov_base; |
671 | h->monhdr.have_version = 0; | 700 | h->monhdr.have_version = 0; |
672 | h->monhdr.session_mon = cpu_to_le16(-1); | 701 | h->monhdr.session_mon = cpu_to_le16(-1); |
673 | h->monhdr.session_mon_tid = 0; | 702 | h->monhdr.session_mon_tid = 0; |
674 | h->fsid = monc->monmap->fsid; | 703 | h->fsid = monc->monmap->fsid; |
704 | send_generic_request(monc, req); | ||
705 | mutex_unlock(&monc->mutex); | ||
675 | 706 | ||
676 | err = do_generic_request(monc, req); | 707 | ret = wait_generic_request(req); |
677 | |||
678 | out: | 708 | out: |
679 | put_generic_request(req); | 709 | put_generic_request(req); |
680 | return err; | 710 | return ret; |
681 | } | 711 | } |
682 | EXPORT_SYMBOL(ceph_monc_do_statfs); | 712 | EXPORT_SYMBOL(ceph_monc_do_statfs); |
683 | 713 | ||
@@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, | |||
690 | void *end = p + msg->front_alloc_len; | 720 | void *end = p + msg->front_alloc_len; |
691 | u64 handle; | 721 | u64 handle; |
692 | 722 | ||
693 | dout("%s %p tid %llu\n", __func__, msg, tid); | 723 | dout("%s msg %p tid %llu\n", __func__, msg, tid); |
694 | 724 | ||
695 | ceph_decode_need(&p, end, 2*sizeof(u64), bad); | 725 | ceph_decode_need(&p, end, 2*sizeof(u64), bad); |
696 | handle = ceph_decode_64(&p); | 726 | handle = ceph_decode_64(&p); |
@@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, | |||
698 | goto bad; | 728 | goto bad; |
699 | 729 | ||
700 | mutex_lock(&monc->mutex); | 730 | mutex_lock(&monc->mutex); |
701 | req = __lookup_generic_req(monc, handle); | 731 | req = lookup_generic_request(&monc->generic_request_tree, handle); |
702 | if (req) { | 732 | if (!req) { |
703 | *(u64 *)req->buf = ceph_decode_64(&p); | 733 | mutex_unlock(&monc->mutex); |
704 | req->result = 0; | 734 | return; |
705 | get_generic_request(req); | ||
706 | } | 735 | } |
736 | |||
737 | req->result = 0; | ||
738 | req->u.newest = ceph_decode_64(&p); | ||
739 | __finish_generic_request(req); | ||
707 | mutex_unlock(&monc->mutex); | 740 | mutex_unlock(&monc->mutex); |
708 | if (req) { | ||
709 | complete_all(&req->completion); | ||
710 | put_generic_request(req); | ||
711 | } | ||
712 | 741 | ||
742 | complete_generic_request(req); | ||
713 | return; | 743 | return; |
744 | |||
714 | bad: | 745 | bad: |
715 | pr_err("corrupt mon_get_version reply, tid %llu\n", tid); | 746 | pr_err("corrupt mon_get_version reply, tid %llu\n", tid); |
716 | ceph_msg_dump(msg); | 747 | ceph_msg_dump(msg); |
717 | } | 748 | } |
718 | 749 | ||
719 | /* | 750 | static struct ceph_mon_generic_request * |
720 | * Send MMonGetVersion and wait for the reply. | 751 | __ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, |
721 | * | 752 | ceph_monc_callback_t cb, u64 private_data) |
722 | * @what: one of "mdsmap", "osdmap" or "monmap" | ||
723 | */ | ||
724 | int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, | ||
725 | u64 *newest) | ||
726 | { | 753 | { |
727 | struct ceph_mon_generic_request *req; | 754 | struct ceph_mon_generic_request *req; |
728 | void *p, *end; | ||
729 | u64 tid; | ||
730 | int err; | ||
731 | 755 | ||
732 | req = kzalloc(sizeof(*req), GFP_NOFS); | 756 | req = alloc_generic_request(monc, GFP_NOIO); |
733 | if (!req) | 757 | if (!req) |
734 | return -ENOMEM; | 758 | goto err_put_req; |
735 | |||
736 | kref_init(&req->kref); | ||
737 | req->buf = newest; | ||
738 | init_completion(&req->completion); | ||
739 | 759 | ||
740 | req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, | 760 | req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, |
741 | sizeof(u64) + sizeof(u32) + strlen(what), | 761 | sizeof(u64) + sizeof(u32) + strlen(what), |
742 | GFP_NOFS, true); | 762 | GFP_NOIO, true); |
743 | if (!req->request) { | 763 | if (!req->request) |
744 | err = -ENOMEM; | 764 | goto err_put_req; |
745 | goto out; | ||
746 | } | ||
747 | 765 | ||
748 | req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, | 766 | req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO, |
749 | GFP_NOFS, true); | 767 | true); |
750 | if (!req->reply) { | 768 | if (!req->reply) |
751 | err = -ENOMEM; | 769 | goto err_put_req; |
752 | goto out; | ||
753 | } | ||
754 | 770 | ||
755 | p = req->request->front.iov_base; | 771 | req->complete_cb = cb; |
756 | end = p + req->request->front_alloc_len; | 772 | req->private_data = private_data; |
757 | 773 | ||
758 | /* fill out request */ | ||
759 | mutex_lock(&monc->mutex); | 774 | mutex_lock(&monc->mutex); |
760 | tid = ++monc->last_tid; | 775 | register_generic_request(req); |
761 | ceph_encode_64(&p, tid); /* handle */ | 776 | { |
762 | ceph_encode_string(&p, end, what, strlen(what)); | 777 | void *p = req->request->front.iov_base; |
778 | void *const end = p + req->request->front_alloc_len; | ||
779 | |||
780 | ceph_encode_64(&p, req->tid); /* handle */ | ||
781 | ceph_encode_string(&p, end, what, strlen(what)); | ||
782 | WARN_ON(p != end); | ||
783 | } | ||
784 | send_generic_request(monc, req); | ||
785 | mutex_unlock(&monc->mutex); | ||
763 | 786 | ||
764 | err = __do_generic_request(monc, tid, req); | 787 | return req; |
765 | 788 | ||
766 | mutex_unlock(&monc->mutex); | 789 | err_put_req: |
767 | out: | ||
768 | put_generic_request(req); | 790 | put_generic_request(req); |
769 | return err; | 791 | return ERR_PTR(-ENOMEM); |
792 | } | ||
793 | |||
794 | /* | ||
795 | * Send MMonGetVersion and wait for the reply. | ||
796 | * | ||
797 | * @what: one of "mdsmap", "osdmap" or "monmap" | ||
798 | */ | ||
799 | int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, | ||
800 | u64 *newest) | ||
801 | { | ||
802 | struct ceph_mon_generic_request *req; | ||
803 | int ret; | ||
804 | |||
805 | req = __ceph_monc_get_version(monc, what, NULL, 0); | ||
806 | if (IS_ERR(req)) | ||
807 | return PTR_ERR(req); | ||
808 | |||
809 | ret = wait_generic_request(req); | ||
810 | if (!ret) | ||
811 | *newest = req->u.newest; | ||
812 | |||
813 | put_generic_request(req); | ||
814 | return ret; | ||
770 | } | 815 | } |
771 | EXPORT_SYMBOL(ceph_monc_do_get_version); | 816 | EXPORT_SYMBOL(ceph_monc_get_version); |
817 | |||
818 | /* | ||
819 | * Send MMonGetVersion, | ||
820 | * | ||
821 | * @what: one of "mdsmap", "osdmap" or "monmap" | ||
822 | */ | ||
823 | int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, | ||
824 | ceph_monc_callback_t cb, u64 private_data) | ||
825 | { | ||
826 | struct ceph_mon_generic_request *req; | ||
827 | |||
828 | req = __ceph_monc_get_version(monc, what, cb, private_data); | ||
829 | if (IS_ERR(req)) | ||
830 | return PTR_ERR(req); | ||
831 | |||
832 | put_generic_request(req); | ||
833 | return 0; | ||
834 | } | ||
835 | EXPORT_SYMBOL(ceph_monc_get_version_async); | ||
772 | 836 | ||
773 | /* | 837 | /* |
774 | * Resend pending generic requests. | 838 | * Resend pending generic requests. |
@@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) | |||
890 | if (!monc->m_subscribe_ack) | 954 | if (!monc->m_subscribe_ack) |
891 | goto out_auth; | 955 | goto out_auth; |
892 | 956 | ||
893 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, | 957 | monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS, |
894 | true); | 958 | true); |
895 | if (!monc->m_subscribe) | 959 | if (!monc->m_subscribe) |
896 | goto out_subscribe_ack; | 960 | goto out_subscribe_ack; |
@@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) | |||
914 | 978 | ||
915 | INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); | 979 | INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); |
916 | monc->generic_request_tree = RB_ROOT; | 980 | monc->generic_request_tree = RB_ROOT; |
917 | monc->num_generic_requests = 0; | ||
918 | monc->last_tid = 0; | 981 | monc->last_tid = 0; |
919 | 982 | ||
983 | monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE; | ||
984 | |||
920 | return 0; | 985 | return 0; |
921 | 986 | ||
922 | out_auth_reply: | 987 | out_auth_reply: |
@@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc) | |||
954 | 1019 | ||
955 | ceph_auth_destroy(monc->auth); | 1020 | ceph_auth_destroy(monc->auth); |
956 | 1021 | ||
1022 | WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree)); | ||
1023 | |||
957 | ceph_msg_put(monc->m_auth); | 1024 | ceph_msg_put(monc->m_auth); |
958 | ceph_msg_put(monc->m_auth_reply); | 1025 | ceph_msg_put(monc->m_auth_reply); |
959 | ceph_msg_put(monc->m_subscribe); | 1026 | ceph_msg_put(monc->m_subscribe); |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 40a53a70efdf..0160d7d09a1e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -19,25 +19,12 @@ | |||
19 | #include <linux/ceph/auth.h> | 19 | #include <linux/ceph/auth.h> |
20 | #include <linux/ceph/pagelist.h> | 20 | #include <linux/ceph/pagelist.h> |
21 | 21 | ||
22 | #define OSD_OP_FRONT_LEN 4096 | ||
23 | #define OSD_OPREPLY_FRONT_LEN 512 | 22 | #define OSD_OPREPLY_FRONT_LEN 512 |
24 | 23 | ||
25 | static struct kmem_cache *ceph_osd_request_cache; | 24 | static struct kmem_cache *ceph_osd_request_cache; |
26 | 25 | ||
27 | static const struct ceph_connection_operations osd_con_ops; | 26 | static const struct ceph_connection_operations osd_con_ops; |
28 | 27 | ||
29 | static void __send_queued(struct ceph_osd_client *osdc); | ||
30 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); | ||
31 | static void __register_request(struct ceph_osd_client *osdc, | ||
32 | struct ceph_osd_request *req); | ||
33 | static void __unregister_request(struct ceph_osd_client *osdc, | ||
34 | struct ceph_osd_request *req); | ||
35 | static void __unregister_linger_request(struct ceph_osd_client *osdc, | ||
36 | struct ceph_osd_request *req); | ||
37 | static void __enqueue_request(struct ceph_osd_request *req); | ||
38 | static void __send_request(struct ceph_osd_client *osdc, | ||
39 | struct ceph_osd_request *req); | ||
40 | |||
41 | /* | 28 | /* |
42 | * Implement client access to distributed object storage cluster. | 29 | * Implement client access to distributed object storage cluster. |
43 | * | 30 | * |
@@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc, | |||
56 | * channel with an OSD is reset. | 43 | * channel with an OSD is reset. |
57 | */ | 44 | */ |
58 | 45 | ||
46 | static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); | ||
47 | static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); | ||
48 | static void link_linger(struct ceph_osd *osd, | ||
49 | struct ceph_osd_linger_request *lreq); | ||
50 | static void unlink_linger(struct ceph_osd *osd, | ||
51 | struct ceph_osd_linger_request *lreq); | ||
52 | |||
53 | #if 1 | ||
54 | static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) | ||
55 | { | ||
56 | bool wrlocked = true; | ||
57 | |||
58 | if (unlikely(down_read_trylock(sem))) { | ||
59 | wrlocked = false; | ||
60 | up_read(sem); | ||
61 | } | ||
62 | |||
63 | return wrlocked; | ||
64 | } | ||
65 | static inline void verify_osdc_locked(struct ceph_osd_client *osdc) | ||
66 | { | ||
67 | WARN_ON(!rwsem_is_locked(&osdc->lock)); | ||
68 | } | ||
69 | static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) | ||
70 | { | ||
71 | WARN_ON(!rwsem_is_wrlocked(&osdc->lock)); | ||
72 | } | ||
73 | static inline void verify_osd_locked(struct ceph_osd *osd) | ||
74 | { | ||
75 | struct ceph_osd_client *osdc = osd->o_osdc; | ||
76 | |||
77 | WARN_ON(!(mutex_is_locked(&osd->lock) && | ||
78 | rwsem_is_locked(&osdc->lock)) && | ||
79 | !rwsem_is_wrlocked(&osdc->lock)); | ||
80 | } | ||
81 | static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) | ||
82 | { | ||
83 | WARN_ON(!mutex_is_locked(&lreq->lock)); | ||
84 | } | ||
85 | #else | ||
86 | static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } | ||
87 | static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } | ||
88 | static inline void verify_osd_locked(struct ceph_osd *osd) { } | ||
89 | static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { } | ||
90 | #endif | ||
91 | |||
59 | /* | 92 | /* |
60 | * calculate the mapping of a file extent onto an object, and fill out the | 93 | * calculate the mapping of a file extent onto an object, and fill out the |
61 | * request accordingly. shorten extent as necessary if it crosses an | 94 | * request accordingly. shorten extent as necessary if it crosses an |
@@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, | |||
144 | } | 177 | } |
145 | EXPORT_SYMBOL(osd_req_op_extent_osd_data); | 178 | EXPORT_SYMBOL(osd_req_op_extent_osd_data); |
146 | 179 | ||
147 | struct ceph_osd_data * | ||
148 | osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, | ||
149 | unsigned int which) | ||
150 | { | ||
151 | return osd_req_op_data(osd_req, which, cls, response_data); | ||
152 | } | ||
153 | EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ | ||
154 | |||
155 | void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, | 180 | void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, |
156 | unsigned int which, struct page **pages, | 181 | unsigned int which, struct page **pages, |
157 | u64 length, u32 alignment, | 182 | u64 length, u32 alignment, |
@@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist( | |||
218 | 243 | ||
219 | osd_data = osd_req_op_data(osd_req, which, cls, request_data); | 244 | osd_data = osd_req_op_data(osd_req, which, cls, request_data); |
220 | ceph_osd_data_pagelist_init(osd_data, pagelist); | 245 | ceph_osd_data_pagelist_init(osd_data, pagelist); |
246 | osd_req->r_ops[which].cls.indata_len += pagelist->length; | ||
247 | osd_req->r_ops[which].indata_len += pagelist->length; | ||
221 | } | 248 | } |
222 | EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); | 249 | EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); |
223 | 250 | ||
@@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, | |||
230 | osd_data = osd_req_op_data(osd_req, which, cls, request_data); | 257 | osd_data = osd_req_op_data(osd_req, which, cls, request_data); |
231 | ceph_osd_data_pages_init(osd_data, pages, length, alignment, | 258 | ceph_osd_data_pages_init(osd_data, pages, length, alignment, |
232 | pages_from_pool, own_pages); | 259 | pages_from_pool, own_pages); |
260 | osd_req->r_ops[which].cls.indata_len += length; | ||
261 | osd_req->r_ops[which].indata_len += length; | ||
233 | } | 262 | } |
234 | EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); | 263 | EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); |
235 | 264 | ||
@@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, | |||
302 | case CEPH_OSD_OP_STAT: | 331 | case CEPH_OSD_OP_STAT: |
303 | ceph_osd_data_release(&op->raw_data_in); | 332 | ceph_osd_data_release(&op->raw_data_in); |
304 | break; | 333 | break; |
334 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
335 | ceph_osd_data_release(&op->notify_ack.request_data); | ||
336 | break; | ||
337 | case CEPH_OSD_OP_NOTIFY: | ||
338 | ceph_osd_data_release(&op->notify.request_data); | ||
339 | ceph_osd_data_release(&op->notify.response_data); | ||
340 | break; | ||
305 | default: | 341 | default: |
306 | break; | 342 | break; |
307 | } | 343 | } |
308 | } | 344 | } |
309 | 345 | ||
310 | /* | 346 | /* |
347 | * Assumes @t is zero-initialized. | ||
348 | */ | ||
349 | static void target_init(struct ceph_osd_request_target *t) | ||
350 | { | ||
351 | ceph_oid_init(&t->base_oid); | ||
352 | ceph_oloc_init(&t->base_oloc); | ||
353 | ceph_oid_init(&t->target_oid); | ||
354 | ceph_oloc_init(&t->target_oloc); | ||
355 | |||
356 | ceph_osds_init(&t->acting); | ||
357 | ceph_osds_init(&t->up); | ||
358 | t->size = -1; | ||
359 | t->min_size = -1; | ||
360 | |||
361 | t->osd = CEPH_HOMELESS_OSD; | ||
362 | } | ||
363 | |||
364 | static void target_copy(struct ceph_osd_request_target *dest, | ||
365 | const struct ceph_osd_request_target *src) | ||
366 | { | ||
367 | ceph_oid_copy(&dest->base_oid, &src->base_oid); | ||
368 | ceph_oloc_copy(&dest->base_oloc, &src->base_oloc); | ||
369 | ceph_oid_copy(&dest->target_oid, &src->target_oid); | ||
370 | ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); | ||
371 | |||
372 | dest->pgid = src->pgid; /* struct */ | ||
373 | dest->pg_num = src->pg_num; | ||
374 | dest->pg_num_mask = src->pg_num_mask; | ||
375 | ceph_osds_copy(&dest->acting, &src->acting); | ||
376 | ceph_osds_copy(&dest->up, &src->up); | ||
377 | dest->size = src->size; | ||
378 | dest->min_size = src->min_size; | ||
379 | dest->sort_bitwise = src->sort_bitwise; | ||
380 | |||
381 | dest->flags = src->flags; | ||
382 | dest->paused = src->paused; | ||
383 | |||
384 | dest->osd = src->osd; | ||
385 | } | ||
386 | |||
387 | static void target_destroy(struct ceph_osd_request_target *t) | ||
388 | { | ||
389 | ceph_oid_destroy(&t->base_oid); | ||
390 | ceph_oid_destroy(&t->target_oid); | ||
391 | } | ||
392 | |||
393 | /* | ||
311 | * requests | 394 | * requests |
312 | */ | 395 | */ |
396 | static void request_release_checks(struct ceph_osd_request *req) | ||
397 | { | ||
398 | WARN_ON(!RB_EMPTY_NODE(&req->r_node)); | ||
399 | WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node)); | ||
400 | WARN_ON(!list_empty(&req->r_unsafe_item)); | ||
401 | WARN_ON(req->r_osd); | ||
402 | } | ||
403 | |||
313 | static void ceph_osdc_release_request(struct kref *kref) | 404 | static void ceph_osdc_release_request(struct kref *kref) |
314 | { | 405 | { |
315 | struct ceph_osd_request *req = container_of(kref, | 406 | struct ceph_osd_request *req = container_of(kref, |
@@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref) | |||
318 | 409 | ||
319 | dout("%s %p (r_request %p r_reply %p)\n", __func__, req, | 410 | dout("%s %p (r_request %p r_reply %p)\n", __func__, req, |
320 | req->r_request, req->r_reply); | 411 | req->r_request, req->r_reply); |
321 | WARN_ON(!RB_EMPTY_NODE(&req->r_node)); | 412 | request_release_checks(req); |
322 | WARN_ON(!list_empty(&req->r_req_lru_item)); | ||
323 | WARN_ON(!list_empty(&req->r_osd_item)); | ||
324 | WARN_ON(!list_empty(&req->r_linger_item)); | ||
325 | WARN_ON(!list_empty(&req->r_linger_osd_item)); | ||
326 | WARN_ON(req->r_osd); | ||
327 | 413 | ||
328 | if (req->r_request) | 414 | if (req->r_request) |
329 | ceph_msg_put(req->r_request); | 415 | ceph_msg_put(req->r_request); |
330 | if (req->r_reply) { | 416 | if (req->r_reply) |
331 | ceph_msg_revoke_incoming(req->r_reply); | ||
332 | ceph_msg_put(req->r_reply); | 417 | ceph_msg_put(req->r_reply); |
333 | } | ||
334 | 418 | ||
335 | for (which = 0; which < req->r_num_ops; which++) | 419 | for (which = 0; which < req->r_num_ops; which++) |
336 | osd_req_op_data_release(req, which); | 420 | osd_req_op_data_release(req, which); |
337 | 421 | ||
422 | target_destroy(&req->r_t); | ||
338 | ceph_put_snap_context(req->r_snapc); | 423 | ceph_put_snap_context(req->r_snapc); |
424 | |||
339 | if (req->r_mempool) | 425 | if (req->r_mempool) |
340 | mempool_free(req, req->r_osdc->req_mempool); | 426 | mempool_free(req, req->r_osdc->req_mempool); |
341 | else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) | 427 | else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) |
@@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request); | |||
354 | 440 | ||
355 | void ceph_osdc_put_request(struct ceph_osd_request *req) | 441 | void ceph_osdc_put_request(struct ceph_osd_request *req) |
356 | { | 442 | { |
357 | dout("%s %p (was %d)\n", __func__, req, | 443 | if (req) { |
358 | atomic_read(&req->r_kref.refcount)); | 444 | dout("%s %p (was %d)\n", __func__, req, |
359 | kref_put(&req->r_kref, ceph_osdc_release_request); | 445 | atomic_read(&req->r_kref.refcount)); |
446 | kref_put(&req->r_kref, ceph_osdc_release_request); | ||
447 | } | ||
360 | } | 448 | } |
361 | EXPORT_SYMBOL(ceph_osdc_put_request); | 449 | EXPORT_SYMBOL(ceph_osdc_put_request); |
362 | 450 | ||
451 | static void request_init(struct ceph_osd_request *req) | ||
452 | { | ||
453 | /* req only, each op is zeroed in _osd_req_op_init() */ | ||
454 | memset(req, 0, sizeof(*req)); | ||
455 | |||
456 | kref_init(&req->r_kref); | ||
457 | init_completion(&req->r_completion); | ||
458 | init_completion(&req->r_safe_completion); | ||
459 | RB_CLEAR_NODE(&req->r_node); | ||
460 | RB_CLEAR_NODE(&req->r_mc_node); | ||
461 | INIT_LIST_HEAD(&req->r_unsafe_item); | ||
462 | |||
463 | target_init(&req->r_t); | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * This is ugly, but it allows us to reuse linger registration and ping | ||
468 | * requests, keeping the structure of the code around send_linger{_ping}() | ||
469 | * reasonable. Setting up a min_nr=2 mempool for each linger request | ||
470 | * and dealing with copying ops (this blasts req only, watch op remains | ||
471 | * intact) isn't any better. | ||
472 | */ | ||
473 | static void request_reinit(struct ceph_osd_request *req) | ||
474 | { | ||
475 | struct ceph_osd_client *osdc = req->r_osdc; | ||
476 | bool mempool = req->r_mempool; | ||
477 | unsigned int num_ops = req->r_num_ops; | ||
478 | u64 snapid = req->r_snapid; | ||
479 | struct ceph_snap_context *snapc = req->r_snapc; | ||
480 | bool linger = req->r_linger; | ||
481 | struct ceph_msg *request_msg = req->r_request; | ||
482 | struct ceph_msg *reply_msg = req->r_reply; | ||
483 | |||
484 | dout("%s req %p\n", __func__, req); | ||
485 | WARN_ON(atomic_read(&req->r_kref.refcount) != 1); | ||
486 | request_release_checks(req); | ||
487 | |||
488 | WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); | ||
489 | WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); | ||
490 | target_destroy(&req->r_t); | ||
491 | |||
492 | request_init(req); | ||
493 | req->r_osdc = osdc; | ||
494 | req->r_mempool = mempool; | ||
495 | req->r_num_ops = num_ops; | ||
496 | req->r_snapid = snapid; | ||
497 | req->r_snapc = snapc; | ||
498 | req->r_linger = linger; | ||
499 | req->r_request = request_msg; | ||
500 | req->r_reply = reply_msg; | ||
501 | } | ||
502 | |||
363 | struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | 503 | struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, |
364 | struct ceph_snap_context *snapc, | 504 | struct ceph_snap_context *snapc, |
365 | unsigned int num_ops, | 505 | unsigned int num_ops, |
@@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
367 | gfp_t gfp_flags) | 507 | gfp_t gfp_flags) |
368 | { | 508 | { |
369 | struct ceph_osd_request *req; | 509 | struct ceph_osd_request *req; |
370 | struct ceph_msg *msg; | ||
371 | size_t msg_size; | ||
372 | 510 | ||
373 | if (use_mempool) { | 511 | if (use_mempool) { |
374 | BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); | 512 | BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); |
@@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
383 | if (unlikely(!req)) | 521 | if (unlikely(!req)) |
384 | return NULL; | 522 | return NULL; |
385 | 523 | ||
386 | /* req only, each op is zeroed in _osd_req_op_init() */ | 524 | request_init(req); |
387 | memset(req, 0, sizeof(*req)); | ||
388 | |||
389 | req->r_osdc = osdc; | 525 | req->r_osdc = osdc; |
390 | req->r_mempool = use_mempool; | 526 | req->r_mempool = use_mempool; |
391 | req->r_num_ops = num_ops; | 527 | req->r_num_ops = num_ops; |
528 | req->r_snapid = CEPH_NOSNAP; | ||
529 | req->r_snapc = ceph_get_snap_context(snapc); | ||
392 | 530 | ||
393 | kref_init(&req->r_kref); | 531 | dout("%s req %p\n", __func__, req); |
394 | init_completion(&req->r_completion); | 532 | return req; |
395 | init_completion(&req->r_safe_completion); | 533 | } |
396 | RB_CLEAR_NODE(&req->r_node); | 534 | EXPORT_SYMBOL(ceph_osdc_alloc_request); |
397 | INIT_LIST_HEAD(&req->r_unsafe_item); | ||
398 | INIT_LIST_HEAD(&req->r_linger_item); | ||
399 | INIT_LIST_HEAD(&req->r_linger_osd_item); | ||
400 | INIT_LIST_HEAD(&req->r_req_lru_item); | ||
401 | INIT_LIST_HEAD(&req->r_osd_item); | ||
402 | |||
403 | req->r_base_oloc.pool = -1; | ||
404 | req->r_target_oloc.pool = -1; | ||
405 | 535 | ||
406 | msg_size = OSD_OPREPLY_FRONT_LEN; | 536 | int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) |
407 | if (num_ops > CEPH_OSD_SLAB_OPS) { | 537 | { |
408 | /* ceph_osd_op and rval */ | 538 | struct ceph_osd_client *osdc = req->r_osdc; |
409 | msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * | 539 | struct ceph_msg *msg; |
410 | (sizeof(struct ceph_osd_op) + 4); | 540 | int msg_size; |
411 | } | ||
412 | 541 | ||
413 | /* create reply message */ | 542 | WARN_ON(ceph_oid_empty(&req->r_base_oid)); |
414 | if (use_mempool) | ||
415 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); | ||
416 | else | ||
417 | msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, | ||
418 | gfp_flags, true); | ||
419 | if (!msg) { | ||
420 | ceph_osdc_put_request(req); | ||
421 | return NULL; | ||
422 | } | ||
423 | req->r_reply = msg; | ||
424 | 543 | ||
544 | /* create request message */ | ||
425 | msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ | 545 | msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ |
426 | msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ | 546 | msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ |
427 | msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ | 547 | msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ |
428 | msg_size += 1 + 8 + 4 + 4; /* pgid */ | 548 | msg_size += 1 + 8 + 4 + 4; /* pgid */ |
429 | msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ | 549 | msg_size += 4 + req->r_base_oid.name_len; /* oid */ |
430 | msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); | 550 | msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); |
431 | msg_size += 8; /* snapid */ | 551 | msg_size += 8; /* snapid */ |
432 | msg_size += 8; /* snap_seq */ | 552 | msg_size += 8; /* snap_seq */ |
433 | msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ | 553 | msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); |
434 | msg_size += 4; /* retry_attempt */ | 554 | msg_size += 4; /* retry_attempt */ |
435 | 555 | ||
436 | /* create request message; allow space for oid */ | 556 | if (req->r_mempool) |
437 | if (use_mempool) | ||
438 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); | 557 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); |
439 | else | 558 | else |
440 | msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); | 559 | msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); |
441 | if (!msg) { | 560 | if (!msg) |
442 | ceph_osdc_put_request(req); | 561 | return -ENOMEM; |
443 | return NULL; | ||
444 | } | ||
445 | 562 | ||
446 | memset(msg->front.iov_base, 0, msg->front.iov_len); | 563 | memset(msg->front.iov_base, 0, msg->front.iov_len); |
447 | |||
448 | req->r_request = msg; | 564 | req->r_request = msg; |
449 | 565 | ||
450 | return req; | 566 | /* create reply message */ |
567 | msg_size = OSD_OPREPLY_FRONT_LEN; | ||
568 | msg_size += req->r_base_oid.name_len; | ||
569 | msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); | ||
570 | |||
571 | if (req->r_mempool) | ||
572 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); | ||
573 | else | ||
574 | msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); | ||
575 | if (!msg) | ||
576 | return -ENOMEM; | ||
577 | |||
578 | req->r_reply = msg; | ||
579 | |||
580 | return 0; | ||
451 | } | 581 | } |
452 | EXPORT_SYMBOL(ceph_osdc_alloc_request); | 582 | EXPORT_SYMBOL(ceph_osdc_alloc_messages); |
453 | 583 | ||
454 | static bool osd_req_opcode_valid(u16 opcode) | 584 | static bool osd_req_opcode_valid(u16 opcode) |
455 | { | 585 | { |
@@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
587 | 717 | ||
588 | osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); | 718 | osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); |
589 | 719 | ||
590 | op->cls.argc = 0; /* currently unused */ | ||
591 | |||
592 | op->indata_len = payload_len; | 720 | op->indata_len = payload_len; |
593 | } | 721 | } |
594 | EXPORT_SYMBOL(osd_req_op_cls_init); | 722 | EXPORT_SYMBOL(osd_req_op_cls_init); |
@@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
627 | } | 755 | } |
628 | EXPORT_SYMBOL(osd_req_op_xattr_init); | 756 | EXPORT_SYMBOL(osd_req_op_xattr_init); |
629 | 757 | ||
630 | void osd_req_op_watch_init(struct ceph_osd_request *osd_req, | 758 | /* |
631 | unsigned int which, u16 opcode, | 759 | * @watch_opcode: CEPH_OSD_WATCH_OP_* |
632 | u64 cookie, u64 version, int flag) | 760 | */ |
761 | static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, | ||
762 | u64 cookie, u8 watch_opcode) | ||
633 | { | 763 | { |
634 | struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, | 764 | struct ceph_osd_req_op *op; |
635 | opcode, 0); | ||
636 | |||
637 | BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); | ||
638 | 765 | ||
766 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); | ||
639 | op->watch.cookie = cookie; | 767 | op->watch.cookie = cookie; |
640 | op->watch.ver = version; | 768 | op->watch.op = watch_opcode; |
641 | if (opcode == CEPH_OSD_OP_WATCH && flag) | 769 | op->watch.gen = 0; |
642 | op->watch.flag = (u8)1; | ||
643 | } | 770 | } |
644 | EXPORT_SYMBOL(osd_req_op_watch_init); | ||
645 | 771 | ||
646 | void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, | 772 | void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, |
647 | unsigned int which, | 773 | unsigned int which, |
@@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, | |||
686 | } | 812 | } |
687 | } | 813 | } |
688 | 814 | ||
689 | static u64 osd_req_encode_op(struct ceph_osd_request *req, | 815 | static u32 osd_req_encode_op(struct ceph_osd_op *dst, |
690 | struct ceph_osd_op *dst, unsigned int which) | 816 | const struct ceph_osd_req_op *src) |
691 | { | 817 | { |
692 | struct ceph_osd_req_op *src; | ||
693 | struct ceph_osd_data *osd_data; | ||
694 | u64 request_data_len = 0; | ||
695 | u64 data_length; | ||
696 | |||
697 | BUG_ON(which >= req->r_num_ops); | ||
698 | src = &req->r_ops[which]; | ||
699 | if (WARN_ON(!osd_req_opcode_valid(src->op))) { | 818 | if (WARN_ON(!osd_req_opcode_valid(src->op))) { |
700 | pr_err("unrecognized osd opcode %d\n", src->op); | 819 | pr_err("unrecognized osd opcode %d\n", src->op); |
701 | 820 | ||
@@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, | |||
704 | 823 | ||
705 | switch (src->op) { | 824 | switch (src->op) { |
706 | case CEPH_OSD_OP_STAT: | 825 | case CEPH_OSD_OP_STAT: |
707 | osd_data = &src->raw_data_in; | ||
708 | ceph_osdc_msg_data_add(req->r_reply, osd_data); | ||
709 | break; | 826 | break; |
710 | case CEPH_OSD_OP_READ: | 827 | case CEPH_OSD_OP_READ: |
711 | case CEPH_OSD_OP_WRITE: | 828 | case CEPH_OSD_OP_WRITE: |
712 | case CEPH_OSD_OP_WRITEFULL: | 829 | case CEPH_OSD_OP_WRITEFULL: |
713 | case CEPH_OSD_OP_ZERO: | 830 | case CEPH_OSD_OP_ZERO: |
714 | case CEPH_OSD_OP_TRUNCATE: | 831 | case CEPH_OSD_OP_TRUNCATE: |
715 | if (src->op == CEPH_OSD_OP_WRITE || | ||
716 | src->op == CEPH_OSD_OP_WRITEFULL) | ||
717 | request_data_len = src->extent.length; | ||
718 | dst->extent.offset = cpu_to_le64(src->extent.offset); | 832 | dst->extent.offset = cpu_to_le64(src->extent.offset); |
719 | dst->extent.length = cpu_to_le64(src->extent.length); | 833 | dst->extent.length = cpu_to_le64(src->extent.length); |
720 | dst->extent.truncate_size = | 834 | dst->extent.truncate_size = |
721 | cpu_to_le64(src->extent.truncate_size); | 835 | cpu_to_le64(src->extent.truncate_size); |
722 | dst->extent.truncate_seq = | 836 | dst->extent.truncate_seq = |
723 | cpu_to_le32(src->extent.truncate_seq); | 837 | cpu_to_le32(src->extent.truncate_seq); |
724 | osd_data = &src->extent.osd_data; | ||
725 | if (src->op == CEPH_OSD_OP_WRITE || | ||
726 | src->op == CEPH_OSD_OP_WRITEFULL) | ||
727 | ceph_osdc_msg_data_add(req->r_request, osd_data); | ||
728 | else | ||
729 | ceph_osdc_msg_data_add(req->r_reply, osd_data); | ||
730 | break; | 838 | break; |
731 | case CEPH_OSD_OP_CALL: | 839 | case CEPH_OSD_OP_CALL: |
732 | dst->cls.class_len = src->cls.class_len; | 840 | dst->cls.class_len = src->cls.class_len; |
733 | dst->cls.method_len = src->cls.method_len; | 841 | dst->cls.method_len = src->cls.method_len; |
734 | osd_data = &src->cls.request_info; | 842 | dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); |
735 | ceph_osdc_msg_data_add(req->r_request, osd_data); | ||
736 | BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); | ||
737 | request_data_len = osd_data->pagelist->length; | ||
738 | |||
739 | osd_data = &src->cls.request_data; | ||
740 | data_length = ceph_osd_data_length(osd_data); | ||
741 | if (data_length) { | ||
742 | BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); | ||
743 | dst->cls.indata_len = cpu_to_le32(data_length); | ||
744 | ceph_osdc_msg_data_add(req->r_request, osd_data); | ||
745 | src->indata_len += data_length; | ||
746 | request_data_len += data_length; | ||
747 | } | ||
748 | osd_data = &src->cls.response_data; | ||
749 | ceph_osdc_msg_data_add(req->r_reply, osd_data); | ||
750 | break; | 843 | break; |
751 | case CEPH_OSD_OP_STARTSYNC: | 844 | case CEPH_OSD_OP_STARTSYNC: |
752 | break; | 845 | break; |
753 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
754 | case CEPH_OSD_OP_WATCH: | 846 | case CEPH_OSD_OP_WATCH: |
755 | dst->watch.cookie = cpu_to_le64(src->watch.cookie); | 847 | dst->watch.cookie = cpu_to_le64(src->watch.cookie); |
756 | dst->watch.ver = cpu_to_le64(src->watch.ver); | 848 | dst->watch.ver = cpu_to_le64(0); |
757 | dst->watch.flag = src->watch.flag; | 849 | dst->watch.op = src->watch.op; |
850 | dst->watch.gen = cpu_to_le32(src->watch.gen); | ||
851 | break; | ||
852 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
853 | break; | ||
854 | case CEPH_OSD_OP_NOTIFY: | ||
855 | dst->notify.cookie = cpu_to_le64(src->notify.cookie); | ||
758 | break; | 856 | break; |
759 | case CEPH_OSD_OP_SETALLOCHINT: | 857 | case CEPH_OSD_OP_SETALLOCHINT: |
760 | dst->alloc_hint.expected_object_size = | 858 | dst->alloc_hint.expected_object_size = |
@@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, | |||
768 | dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); | 866 | dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); |
769 | dst->xattr.cmp_op = src->xattr.cmp_op; | 867 | dst->xattr.cmp_op = src->xattr.cmp_op; |
770 | dst->xattr.cmp_mode = src->xattr.cmp_mode; | 868 | dst->xattr.cmp_mode = src->xattr.cmp_mode; |
771 | osd_data = &src->xattr.osd_data; | ||
772 | ceph_osdc_msg_data_add(req->r_request, osd_data); | ||
773 | request_data_len = osd_data->pagelist->length; | ||
774 | break; | 869 | break; |
775 | case CEPH_OSD_OP_CREATE: | 870 | case CEPH_OSD_OP_CREATE: |
776 | case CEPH_OSD_OP_DELETE: | 871 | case CEPH_OSD_OP_DELETE: |
@@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, | |||
787 | dst->flags = cpu_to_le32(src->flags); | 882 | dst->flags = cpu_to_le32(src->flags); |
788 | dst->payload_len = cpu_to_le32(src->indata_len); | 883 | dst->payload_len = cpu_to_le32(src->indata_len); |
789 | 884 | ||
790 | return request_data_len; | 885 | return src->indata_len; |
791 | } | 886 | } |
792 | 887 | ||
793 | /* | 888 | /* |
@@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
824 | 919 | ||
825 | req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, | 920 | req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, |
826 | GFP_NOFS); | 921 | GFP_NOFS); |
827 | if (!req) | 922 | if (!req) { |
828 | return ERR_PTR(-ENOMEM); | 923 | r = -ENOMEM; |
829 | 924 | goto fail; | |
830 | req->r_flags = flags; | 925 | } |
831 | 926 | ||
832 | /* calculate max write size */ | 927 | /* calculate max write size */ |
833 | r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); | 928 | r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); |
834 | if (r < 0) { | 929 | if (r) |
835 | ceph_osdc_put_request(req); | 930 | goto fail; |
836 | return ERR_PTR(r); | ||
837 | } | ||
838 | 931 | ||
839 | if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { | 932 | if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { |
840 | osd_req_op_init(req, which, opcode, 0); | 933 | osd_req_op_init(req, which, opcode, 0); |
@@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
854 | truncate_size, truncate_seq); | 947 | truncate_size, truncate_seq); |
855 | } | 948 | } |
856 | 949 | ||
950 | req->r_flags = flags; | ||
857 | req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); | 951 | req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); |
952 | ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); | ||
858 | 953 | ||
859 | snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), | 954 | req->r_snapid = vino.snap; |
860 | "%llx.%08llx", vino.ino, objnum); | 955 | if (flags & CEPH_OSD_FLAG_WRITE) |
861 | req->r_base_oid.name_len = strlen(req->r_base_oid.name); | 956 | req->r_data_offset = off; |
957 | |||
958 | r = ceph_osdc_alloc_messages(req, GFP_NOFS); | ||
959 | if (r) | ||
960 | goto fail; | ||
862 | 961 | ||
863 | return req; | 962 | return req; |
963 | |||
964 | fail: | ||
965 | ceph_osdc_put_request(req); | ||
966 | return ERR_PTR(r); | ||
864 | } | 967 | } |
865 | EXPORT_SYMBOL(ceph_osdc_new_request); | 968 | EXPORT_SYMBOL(ceph_osdc_new_request); |
866 | 969 | ||
867 | /* | 970 | /* |
868 | * We keep osd requests in an rbtree, sorted by ->r_tid. | 971 | * We keep osd requests in an rbtree, sorted by ->r_tid. |
869 | */ | 972 | */ |
870 | static void __insert_request(struct ceph_osd_client *osdc, | 973 | DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) |
871 | struct ceph_osd_request *new) | 974 | DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) |
872 | { | ||
873 | struct rb_node **p = &osdc->requests.rb_node; | ||
874 | struct rb_node *parent = NULL; | ||
875 | struct ceph_osd_request *req = NULL; | ||
876 | |||
877 | while (*p) { | ||
878 | parent = *p; | ||
879 | req = rb_entry(parent, struct ceph_osd_request, r_node); | ||
880 | if (new->r_tid < req->r_tid) | ||
881 | p = &(*p)->rb_left; | ||
882 | else if (new->r_tid > req->r_tid) | ||
883 | p = &(*p)->rb_right; | ||
884 | else | ||
885 | BUG(); | ||
886 | } | ||
887 | |||
888 | rb_link_node(&new->r_node, parent, p); | ||
889 | rb_insert_color(&new->r_node, &osdc->requests); | ||
890 | } | ||
891 | |||
892 | static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, | ||
893 | u64 tid) | ||
894 | { | ||
895 | struct ceph_osd_request *req; | ||
896 | struct rb_node *n = osdc->requests.rb_node; | ||
897 | |||
898 | while (n) { | ||
899 | req = rb_entry(n, struct ceph_osd_request, r_node); | ||
900 | if (tid < req->r_tid) | ||
901 | n = n->rb_left; | ||
902 | else if (tid > req->r_tid) | ||
903 | n = n->rb_right; | ||
904 | else | ||
905 | return req; | ||
906 | } | ||
907 | return NULL; | ||
908 | } | ||
909 | 975 | ||
910 | static struct ceph_osd_request * | 976 | static bool osd_homeless(struct ceph_osd *osd) |
911 | __lookup_request_ge(struct ceph_osd_client *osdc, | ||
912 | u64 tid) | ||
913 | { | 977 | { |
914 | struct ceph_osd_request *req; | 978 | return osd->o_osd == CEPH_HOMELESS_OSD; |
915 | struct rb_node *n = osdc->requests.rb_node; | ||
916 | |||
917 | while (n) { | ||
918 | req = rb_entry(n, struct ceph_osd_request, r_node); | ||
919 | if (tid < req->r_tid) { | ||
920 | if (!n->rb_left) | ||
921 | return req; | ||
922 | n = n->rb_left; | ||
923 | } else if (tid > req->r_tid) { | ||
924 | n = n->rb_right; | ||
925 | } else { | ||
926 | return req; | ||
927 | } | ||
928 | } | ||
929 | return NULL; | ||
930 | } | 979 | } |
931 | 980 | ||
932 | static void __kick_linger_request(struct ceph_osd_request *req) | 981 | static bool osd_registered(struct ceph_osd *osd) |
933 | { | 982 | { |
934 | struct ceph_osd_client *osdc = req->r_osdc; | 983 | verify_osdc_locked(osd->o_osdc); |
935 | struct ceph_osd *osd = req->r_osd; | ||
936 | |||
937 | /* | ||
938 | * Linger requests need to be resent with a new tid to avoid | ||
939 | * the dup op detection logic on the OSDs. Achieve this with | ||
940 | * a re-register dance instead of open-coding. | ||
941 | */ | ||
942 | ceph_osdc_get_request(req); | ||
943 | if (!list_empty(&req->r_linger_item)) | ||
944 | __unregister_linger_request(osdc, req); | ||
945 | else | ||
946 | __unregister_request(osdc, req); | ||
947 | __register_request(osdc, req); | ||
948 | ceph_osdc_put_request(req); | ||
949 | |||
950 | /* | ||
951 | * Unless request has been registered as both normal and | ||
952 | * lingering, __unregister{,_linger}_request clears r_osd. | ||
953 | * However, here we need to preserve r_osd to make sure we | ||
954 | * requeue on the same OSD. | ||
955 | */ | ||
956 | WARN_ON(req->r_osd || !osd); | ||
957 | req->r_osd = osd; | ||
958 | 984 | ||
959 | dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); | 985 | return !RB_EMPTY_NODE(&osd->o_node); |
960 | __enqueue_request(req); | ||
961 | } | 986 | } |
962 | 987 | ||
963 | /* | 988 | /* |
964 | * Resubmit requests pending on the given osd. | 989 | * Assumes @osd is zero-initialized. |
965 | */ | 990 | */ |
966 | static void __kick_osd_requests(struct ceph_osd_client *osdc, | 991 | static void osd_init(struct ceph_osd *osd) |
967 | struct ceph_osd *osd) | ||
968 | { | 992 | { |
969 | struct ceph_osd_request *req, *nreq; | 993 | atomic_set(&osd->o_ref, 1); |
970 | LIST_HEAD(resend); | 994 | RB_CLEAR_NODE(&osd->o_node); |
971 | LIST_HEAD(resend_linger); | 995 | osd->o_requests = RB_ROOT; |
972 | int err; | 996 | osd->o_linger_requests = RB_ROOT; |
973 | 997 | INIT_LIST_HEAD(&osd->o_osd_lru); | |
974 | dout("%s osd%d\n", __func__, osd->o_osd); | 998 | INIT_LIST_HEAD(&osd->o_keepalive_item); |
975 | err = __reset_osd(osdc, osd); | 999 | osd->o_incarnation = 1; |
976 | if (err) | 1000 | mutex_init(&osd->lock); |
977 | return; | ||
978 | |||
979 | /* | ||
980 | * Build up a list of requests to resend by traversing the | ||
981 | * osd's list of requests. Requests for a given object are | ||
982 | * sent in tid order, and that is also the order they're | ||
983 | * kept on this list. Therefore all requests that are in | ||
984 | * flight will be found first, followed by all requests that | ||
985 | * have not yet been sent. And to resend requests while | ||
986 | * preserving this order we will want to put any sent | ||
987 | * requests back on the front of the osd client's unsent | ||
988 | * list. | ||
989 | * | ||
990 | * So we build a separate ordered list of already-sent | ||
991 | * requests for the affected osd and splice it onto the | ||
992 | * front of the osd client's unsent list. Once we've seen a | ||
993 | * request that has not yet been sent we're done. Those | ||
994 | * requests are already sitting right where they belong. | ||
995 | */ | ||
996 | list_for_each_entry(req, &osd->o_requests, r_osd_item) { | ||
997 | if (!req->r_sent) | ||
998 | break; | ||
999 | |||
1000 | if (!req->r_linger) { | ||
1001 | dout("%s requeueing %p tid %llu\n", __func__, req, | ||
1002 | req->r_tid); | ||
1003 | list_move_tail(&req->r_req_lru_item, &resend); | ||
1004 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | ||
1005 | } else { | ||
1006 | list_move_tail(&req->r_req_lru_item, &resend_linger); | ||
1007 | } | ||
1008 | } | ||
1009 | list_splice(&resend, &osdc->req_unsent); | ||
1010 | |||
1011 | /* | ||
1012 | * Both registered and not yet registered linger requests are | ||
1013 | * enqueued with a new tid on the same OSD. We add/move them | ||
1014 | * to req_unsent/o_requests at the end to keep things in tid | ||
1015 | * order. | ||
1016 | */ | ||
1017 | list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, | ||
1018 | r_linger_osd_item) { | ||
1019 | WARN_ON(!list_empty(&req->r_req_lru_item)); | ||
1020 | __kick_linger_request(req); | ||
1021 | } | ||
1022 | |||
1023 | list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item) | ||
1024 | __kick_linger_request(req); | ||
1025 | } | 1001 | } |
1026 | 1002 | ||
1027 | /* | 1003 | static void osd_cleanup(struct ceph_osd *osd) |
1028 | * If the osd connection drops, we need to resubmit all requests. | ||
1029 | */ | ||
1030 | static void osd_reset(struct ceph_connection *con) | ||
1031 | { | 1004 | { |
1032 | struct ceph_osd *osd = con->private; | 1005 | WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); |
1033 | struct ceph_osd_client *osdc; | 1006 | WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); |
1034 | 1007 | WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); | |
1035 | if (!osd) | 1008 | WARN_ON(!list_empty(&osd->o_osd_lru)); |
1036 | return; | 1009 | WARN_ON(!list_empty(&osd->o_keepalive_item)); |
1037 | dout("osd_reset osd%d\n", osd->o_osd); | 1010 | |
1038 | osdc = osd->o_osdc; | 1011 | if (osd->o_auth.authorizer) { |
1039 | down_read(&osdc->map_sem); | 1012 | WARN_ON(osd_homeless(osd)); |
1040 | mutex_lock(&osdc->request_mutex); | 1013 | ceph_auth_destroy_authorizer(osd->o_auth.authorizer); |
1041 | __kick_osd_requests(osdc, osd); | 1014 | } |
1042 | __send_queued(osdc); | ||
1043 | mutex_unlock(&osdc->request_mutex); | ||
1044 | up_read(&osdc->map_sem); | ||
1045 | } | 1015 | } |
1046 | 1016 | ||
1047 | /* | 1017 | /* |
@@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) | |||
1051 | { | 1021 | { |
1052 | struct ceph_osd *osd; | 1022 | struct ceph_osd *osd; |
1053 | 1023 | ||
1054 | osd = kzalloc(sizeof(*osd), GFP_NOFS); | 1024 | WARN_ON(onum == CEPH_HOMELESS_OSD); |
1055 | if (!osd) | ||
1056 | return NULL; | ||
1057 | 1025 | ||
1058 | atomic_set(&osd->o_ref, 1); | 1026 | osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL); |
1027 | osd_init(osd); | ||
1059 | osd->o_osdc = osdc; | 1028 | osd->o_osdc = osdc; |
1060 | osd->o_osd = onum; | 1029 | osd->o_osd = onum; |
1061 | RB_CLEAR_NODE(&osd->o_node); | ||
1062 | INIT_LIST_HEAD(&osd->o_requests); | ||
1063 | INIT_LIST_HEAD(&osd->o_linger_requests); | ||
1064 | INIT_LIST_HEAD(&osd->o_osd_lru); | ||
1065 | osd->o_incarnation = 1; | ||
1066 | 1030 | ||
1067 | ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); | 1031 | ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); |
1068 | 1032 | ||
1069 | INIT_LIST_HEAD(&osd->o_keepalive_item); | ||
1070 | return osd; | 1033 | return osd; |
1071 | } | 1034 | } |
1072 | 1035 | ||
@@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd) | |||
1087 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), | 1050 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), |
1088 | atomic_read(&osd->o_ref) - 1); | 1051 | atomic_read(&osd->o_ref) - 1); |
1089 | if (atomic_dec_and_test(&osd->o_ref)) { | 1052 | if (atomic_dec_and_test(&osd->o_ref)) { |
1090 | if (osd->o_auth.authorizer) | 1053 | osd_cleanup(osd); |
1091 | ceph_auth_destroy_authorizer(osd->o_auth.authorizer); | ||
1092 | kfree(osd); | 1054 | kfree(osd); |
1093 | } | 1055 | } |
1094 | } | 1056 | } |
1095 | 1057 | ||
1096 | /* | 1058 | DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) |
1097 | * remove an osd from our map | ||
1098 | */ | ||
1099 | static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | ||
1100 | { | ||
1101 | dout("%s %p osd%d\n", __func__, osd, osd->o_osd); | ||
1102 | WARN_ON(!list_empty(&osd->o_requests)); | ||
1103 | WARN_ON(!list_empty(&osd->o_linger_requests)); | ||
1104 | 1059 | ||
1105 | list_del_init(&osd->o_osd_lru); | 1060 | static void __move_osd_to_lru(struct ceph_osd *osd) |
1106 | rb_erase(&osd->o_node, &osdc->osds); | ||
1107 | RB_CLEAR_NODE(&osd->o_node); | ||
1108 | } | ||
1109 | |||
1110 | static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | ||
1111 | { | 1061 | { |
1112 | dout("%s %p osd%d\n", __func__, osd, osd->o_osd); | 1062 | struct ceph_osd_client *osdc = osd->o_osdc; |
1113 | |||
1114 | if (!RB_EMPTY_NODE(&osd->o_node)) { | ||
1115 | ceph_con_close(&osd->o_con); | ||
1116 | __remove_osd(osdc, osd); | ||
1117 | put_osd(osd); | ||
1118 | } | ||
1119 | } | ||
1120 | |||
1121 | static void remove_all_osds(struct ceph_osd_client *osdc) | ||
1122 | { | ||
1123 | dout("%s %p\n", __func__, osdc); | ||
1124 | mutex_lock(&osdc->request_mutex); | ||
1125 | while (!RB_EMPTY_ROOT(&osdc->osds)) { | ||
1126 | struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), | ||
1127 | struct ceph_osd, o_node); | ||
1128 | remove_osd(osdc, osd); | ||
1129 | } | ||
1130 | mutex_unlock(&osdc->request_mutex); | ||
1131 | } | ||
1132 | 1063 | ||
1133 | static void __move_osd_to_lru(struct ceph_osd_client *osdc, | 1064 | dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); |
1134 | struct ceph_osd *osd) | ||
1135 | { | ||
1136 | dout("%s %p\n", __func__, osd); | ||
1137 | BUG_ON(!list_empty(&osd->o_osd_lru)); | 1065 | BUG_ON(!list_empty(&osd->o_osd_lru)); |
1138 | 1066 | ||
1067 | spin_lock(&osdc->osd_lru_lock); | ||
1139 | list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); | 1068 | list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); |
1069 | spin_unlock(&osdc->osd_lru_lock); | ||
1070 | |||
1140 | osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; | 1071 | osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; |
1141 | } | 1072 | } |
1142 | 1073 | ||
1143 | static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, | 1074 | static void maybe_move_osd_to_lru(struct ceph_osd *osd) |
1144 | struct ceph_osd *osd) | ||
1145 | { | 1075 | { |
1146 | dout("%s %p\n", __func__, osd); | 1076 | if (RB_EMPTY_ROOT(&osd->o_requests) && |
1147 | 1077 | RB_EMPTY_ROOT(&osd->o_linger_requests)) | |
1148 | if (list_empty(&osd->o_requests) && | 1078 | __move_osd_to_lru(osd); |
1149 | list_empty(&osd->o_linger_requests)) | ||
1150 | __move_osd_to_lru(osdc, osd); | ||
1151 | } | 1079 | } |
1152 | 1080 | ||
1153 | static void __remove_osd_from_lru(struct ceph_osd *osd) | 1081 | static void __remove_osd_from_lru(struct ceph_osd *osd) |
1154 | { | 1082 | { |
1155 | dout("__remove_osd_from_lru %p\n", osd); | 1083 | struct ceph_osd_client *osdc = osd->o_osdc; |
1084 | |||
1085 | dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); | ||
1086 | |||
1087 | spin_lock(&osdc->osd_lru_lock); | ||
1156 | if (!list_empty(&osd->o_osd_lru)) | 1088 | if (!list_empty(&osd->o_osd_lru)) |
1157 | list_del_init(&osd->o_osd_lru); | 1089 | list_del_init(&osd->o_osd_lru); |
1090 | spin_unlock(&osdc->osd_lru_lock); | ||
1158 | } | 1091 | } |
1159 | 1092 | ||
1160 | static void remove_old_osds(struct ceph_osd_client *osdc) | 1093 | /* |
1094 | * Close the connection and assign any leftover requests to the | ||
1095 | * homeless session. | ||
1096 | */ | ||
1097 | static void close_osd(struct ceph_osd *osd) | ||
1161 | { | 1098 | { |
1162 | struct ceph_osd *osd, *nosd; | 1099 | struct ceph_osd_client *osdc = osd->o_osdc; |
1100 | struct rb_node *n; | ||
1163 | 1101 | ||
1164 | dout("__remove_old_osds %p\n", osdc); | 1102 | verify_osdc_wrlocked(osdc); |
1165 | mutex_lock(&osdc->request_mutex); | 1103 | dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); |
1166 | list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { | 1104 | |
1167 | if (time_before(jiffies, osd->lru_ttl)) | 1105 | ceph_con_close(&osd->o_con); |
1168 | break; | 1106 | |
1169 | remove_osd(osdc, osd); | 1107 | for (n = rb_first(&osd->o_requests); n; ) { |
1108 | struct ceph_osd_request *req = | ||
1109 | rb_entry(n, struct ceph_osd_request, r_node); | ||
1110 | |||
1111 | n = rb_next(n); /* unlink_request() */ | ||
1112 | |||
1113 | dout(" reassigning req %p tid %llu\n", req, req->r_tid); | ||
1114 | unlink_request(osd, req); | ||
1115 | link_request(&osdc->homeless_osd, req); | ||
1116 | } | ||
1117 | for (n = rb_first(&osd->o_linger_requests); n; ) { | ||
1118 | struct ceph_osd_linger_request *lreq = | ||
1119 | rb_entry(n, struct ceph_osd_linger_request, node); | ||
1120 | |||
1121 | n = rb_next(n); /* unlink_linger() */ | ||
1122 | |||
1123 | dout(" reassigning lreq %p linger_id %llu\n", lreq, | ||
1124 | lreq->linger_id); | ||
1125 | unlink_linger(osd, lreq); | ||
1126 | link_linger(&osdc->homeless_osd, lreq); | ||
1170 | } | 1127 | } |
1171 | mutex_unlock(&osdc->request_mutex); | 1128 | |
1129 | __remove_osd_from_lru(osd); | ||
1130 | erase_osd(&osdc->osds, osd); | ||
1131 | put_osd(osd); | ||
1172 | } | 1132 | } |
1173 | 1133 | ||
1174 | /* | 1134 | /* |
1175 | * reset osd connect | 1135 | * reset osd connect |
1176 | */ | 1136 | */ |
1177 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | 1137 | static int reopen_osd(struct ceph_osd *osd) |
1178 | { | 1138 | { |
1179 | struct ceph_entity_addr *peer_addr; | 1139 | struct ceph_entity_addr *peer_addr; |
1180 | 1140 | ||
1181 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); | 1141 | dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); |
1182 | if (list_empty(&osd->o_requests) && | 1142 | |
1183 | list_empty(&osd->o_linger_requests)) { | 1143 | if (RB_EMPTY_ROOT(&osd->o_requests) && |
1184 | remove_osd(osdc, osd); | 1144 | RB_EMPTY_ROOT(&osd->o_linger_requests)) { |
1145 | close_osd(osd); | ||
1185 | return -ENODEV; | 1146 | return -ENODEV; |
1186 | } | 1147 | } |
1187 | 1148 | ||
1188 | peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; | 1149 | peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd]; |
1189 | if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && | 1150 | if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && |
1190 | !ceph_con_opened(&osd->o_con)) { | 1151 | !ceph_con_opened(&osd->o_con)) { |
1191 | struct ceph_osd_request *req; | 1152 | struct rb_node *n; |
1192 | 1153 | ||
1193 | dout("osd addr hasn't changed and connection never opened, " | 1154 | dout("osd addr hasn't changed and connection never opened, " |
1194 | "letting msgr retry\n"); | 1155 | "letting msgr retry\n"); |
1195 | /* touch each r_stamp for handle_timeout()'s benfit */ | 1156 | /* touch each r_stamp for handle_timeout()'s benfit */ |
1196 | list_for_each_entry(req, &osd->o_requests, r_osd_item) | 1157 | for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { |
1158 | struct ceph_osd_request *req = | ||
1159 | rb_entry(n, struct ceph_osd_request, r_node); | ||
1197 | req->r_stamp = jiffies; | 1160 | req->r_stamp = jiffies; |
1161 | } | ||
1198 | 1162 | ||
1199 | return -EAGAIN; | 1163 | return -EAGAIN; |
1200 | } | 1164 | } |
@@ -1206,455 +1170,1370 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | |||
1206 | return 0; | 1170 | return 0; |
1207 | } | 1171 | } |
1208 | 1172 | ||
1209 | static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) | 1173 | static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o, |
1174 | bool wrlocked) | ||
1210 | { | 1175 | { |
1211 | struct rb_node **p = &osdc->osds.rb_node; | 1176 | struct ceph_osd *osd; |
1212 | struct rb_node *parent = NULL; | ||
1213 | struct ceph_osd *osd = NULL; | ||
1214 | 1177 | ||
1215 | dout("__insert_osd %p osd%d\n", new, new->o_osd); | 1178 | if (wrlocked) |
1216 | while (*p) { | 1179 | verify_osdc_wrlocked(osdc); |
1217 | parent = *p; | 1180 | else |
1218 | osd = rb_entry(parent, struct ceph_osd, o_node); | 1181 | verify_osdc_locked(osdc); |
1219 | if (new->o_osd < osd->o_osd) | 1182 | |
1220 | p = &(*p)->rb_left; | 1183 | if (o != CEPH_HOMELESS_OSD) |
1221 | else if (new->o_osd > osd->o_osd) | 1184 | osd = lookup_osd(&osdc->osds, o); |
1222 | p = &(*p)->rb_right; | 1185 | else |
1223 | else | 1186 | osd = &osdc->homeless_osd; |
1224 | BUG(); | 1187 | if (!osd) { |
1188 | if (!wrlocked) | ||
1189 | return ERR_PTR(-EAGAIN); | ||
1190 | |||
1191 | osd = create_osd(osdc, o); | ||
1192 | insert_osd(&osdc->osds, osd); | ||
1193 | ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, | ||
1194 | &osdc->osdmap->osd_addr[osd->o_osd]); | ||
1225 | } | 1195 | } |
1226 | 1196 | ||
1227 | rb_link_node(&new->o_node, parent, p); | 1197 | dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd); |
1228 | rb_insert_color(&new->o_node, &osdc->osds); | 1198 | return osd; |
1229 | } | 1199 | } |
1230 | 1200 | ||
1231 | static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) | 1201 | /* |
1202 | * Create request <-> OSD session relation. | ||
1203 | * | ||
1204 | * @req has to be assigned a tid, @osd may be homeless. | ||
1205 | */ | ||
1206 | static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) | ||
1232 | { | 1207 | { |
1233 | struct ceph_osd *osd; | 1208 | verify_osd_locked(osd); |
1234 | struct rb_node *n = osdc->osds.rb_node; | 1209 | WARN_ON(!req->r_tid || req->r_osd); |
1235 | 1210 | dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, | |
1236 | while (n) { | 1211 | req, req->r_tid); |
1237 | osd = rb_entry(n, struct ceph_osd, o_node); | 1212 | |
1238 | if (o < osd->o_osd) | 1213 | if (!osd_homeless(osd)) |
1239 | n = n->rb_left; | 1214 | __remove_osd_from_lru(osd); |
1240 | else if (o > osd->o_osd) | 1215 | else |
1241 | n = n->rb_right; | 1216 | atomic_inc(&osd->o_osdc->num_homeless); |
1242 | else | 1217 | |
1243 | return osd; | 1218 | get_osd(osd); |
1244 | } | 1219 | insert_request(&osd->o_requests, req); |
1245 | return NULL; | 1220 | req->r_osd = osd; |
1246 | } | 1221 | } |
1247 | 1222 | ||
1248 | static void __schedule_osd_timeout(struct ceph_osd_client *osdc) | 1223 | static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) |
1249 | { | 1224 | { |
1250 | schedule_delayed_work(&osdc->timeout_work, | 1225 | verify_osd_locked(osd); |
1251 | osdc->client->options->osd_keepalive_timeout); | 1226 | WARN_ON(req->r_osd != osd); |
1227 | dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, | ||
1228 | req, req->r_tid); | ||
1229 | |||
1230 | req->r_osd = NULL; | ||
1231 | erase_request(&osd->o_requests, req); | ||
1232 | put_osd(osd); | ||
1233 | |||
1234 | if (!osd_homeless(osd)) | ||
1235 | maybe_move_osd_to_lru(osd); | ||
1236 | else | ||
1237 | atomic_dec(&osd->o_osdc->num_homeless); | ||
1252 | } | 1238 | } |
1253 | 1239 | ||
1254 | static void __cancel_osd_timeout(struct ceph_osd_client *osdc) | 1240 | static bool __pool_full(struct ceph_pg_pool_info *pi) |
1255 | { | 1241 | { |
1256 | cancel_delayed_work(&osdc->timeout_work); | 1242 | return pi->flags & CEPH_POOL_FLAG_FULL; |
1257 | } | 1243 | } |
1258 | 1244 | ||
1259 | /* | 1245 | static bool have_pool_full(struct ceph_osd_client *osdc) |
1260 | * Register request, assign tid. If this is the first request, set up | ||
1261 | * the timeout event. | ||
1262 | */ | ||
1263 | static void __register_request(struct ceph_osd_client *osdc, | ||
1264 | struct ceph_osd_request *req) | ||
1265 | { | 1246 | { |
1266 | req->r_tid = ++osdc->last_tid; | 1247 | struct rb_node *n; |
1267 | req->r_request->hdr.tid = cpu_to_le64(req->r_tid); | 1248 | |
1268 | dout("__register_request %p tid %lld\n", req, req->r_tid); | 1249 | for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { |
1269 | __insert_request(osdc, req); | 1250 | struct ceph_pg_pool_info *pi = |
1270 | ceph_osdc_get_request(req); | 1251 | rb_entry(n, struct ceph_pg_pool_info, node); |
1271 | osdc->num_requests++; | 1252 | |
1272 | if (osdc->num_requests == 1) { | 1253 | if (__pool_full(pi)) |
1273 | dout(" first request, scheduling timeout\n"); | 1254 | return true; |
1274 | __schedule_osd_timeout(osdc); | ||
1275 | } | 1255 | } |
1256 | |||
1257 | return false; | ||
1258 | } | ||
1259 | |||
1260 | static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id) | ||
1261 | { | ||
1262 | struct ceph_pg_pool_info *pi; | ||
1263 | |||
1264 | pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); | ||
1265 | if (!pi) | ||
1266 | return false; | ||
1267 | |||
1268 | return __pool_full(pi); | ||
1276 | } | 1269 | } |
1277 | 1270 | ||
1278 | /* | 1271 | /* |
1279 | * called under osdc->request_mutex | 1272 | * Returns whether a request should be blocked from being sent |
1273 | * based on the current osdmap and osd_client settings. | ||
1280 | */ | 1274 | */ |
1281 | static void __unregister_request(struct ceph_osd_client *osdc, | 1275 | static bool target_should_be_paused(struct ceph_osd_client *osdc, |
1282 | struct ceph_osd_request *req) | 1276 | const struct ceph_osd_request_target *t, |
1277 | struct ceph_pg_pool_info *pi) | ||
1283 | { | 1278 | { |
1284 | if (RB_EMPTY_NODE(&req->r_node)) { | 1279 | bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); |
1285 | dout("__unregister_request %p tid %lld not registered\n", | 1280 | bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || |
1286 | req, req->r_tid); | 1281 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || |
1287 | return; | 1282 | __pool_full(pi); |
1283 | |||
1284 | WARN_ON(pi->id != t->base_oloc.pool); | ||
1285 | return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || | ||
1286 | (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); | ||
1287 | } | ||
1288 | |||
1289 | enum calc_target_result { | ||
1290 | CALC_TARGET_NO_ACTION = 0, | ||
1291 | CALC_TARGET_NEED_RESEND, | ||
1292 | CALC_TARGET_POOL_DNE, | ||
1293 | }; | ||
1294 | |||
1295 | static enum calc_target_result calc_target(struct ceph_osd_client *osdc, | ||
1296 | struct ceph_osd_request_target *t, | ||
1297 | u32 *last_force_resend, | ||
1298 | bool any_change) | ||
1299 | { | ||
1300 | struct ceph_pg_pool_info *pi; | ||
1301 | struct ceph_pg pgid, last_pgid; | ||
1302 | struct ceph_osds up, acting; | ||
1303 | bool force_resend = false; | ||
1304 | bool need_check_tiering = false; | ||
1305 | bool need_resend = false; | ||
1306 | bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, | ||
1307 | CEPH_OSDMAP_SORTBITWISE); | ||
1308 | enum calc_target_result ct_res; | ||
1309 | int ret; | ||
1310 | |||
1311 | pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); | ||
1312 | if (!pi) { | ||
1313 | t->osd = CEPH_HOMELESS_OSD; | ||
1314 | ct_res = CALC_TARGET_POOL_DNE; | ||
1315 | goto out; | ||
1288 | } | 1316 | } |
1289 | 1317 | ||
1290 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); | 1318 | if (osdc->osdmap->epoch == pi->last_force_request_resend) { |
1291 | rb_erase(&req->r_node, &osdc->requests); | 1319 | if (last_force_resend && |
1292 | RB_CLEAR_NODE(&req->r_node); | 1320 | *last_force_resend < pi->last_force_request_resend) { |
1293 | osdc->num_requests--; | 1321 | *last_force_resend = pi->last_force_request_resend; |
1322 | force_resend = true; | ||
1323 | } else if (!last_force_resend) { | ||
1324 | force_resend = true; | ||
1325 | } | ||
1326 | } | ||
1327 | if (ceph_oid_empty(&t->target_oid) || force_resend) { | ||
1328 | ceph_oid_copy(&t->target_oid, &t->base_oid); | ||
1329 | need_check_tiering = true; | ||
1330 | } | ||
1331 | if (ceph_oloc_empty(&t->target_oloc) || force_resend) { | ||
1332 | ceph_oloc_copy(&t->target_oloc, &t->base_oloc); | ||
1333 | need_check_tiering = true; | ||
1334 | } | ||
1294 | 1335 | ||
1295 | if (req->r_osd) { | 1336 | if (need_check_tiering && |
1296 | /* make sure the original request isn't in flight. */ | 1337 | (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { |
1297 | ceph_msg_revoke(req->r_request); | 1338 | if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) |
1339 | t->target_oloc.pool = pi->read_tier; | ||
1340 | if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) | ||
1341 | t->target_oloc.pool = pi->write_tier; | ||
1342 | } | ||
1298 | 1343 | ||
1299 | list_del_init(&req->r_osd_item); | 1344 | ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, |
1300 | maybe_move_osd_to_lru(osdc, req->r_osd); | 1345 | &t->target_oloc, &pgid); |
1301 | if (list_empty(&req->r_linger_osd_item)) | 1346 | if (ret) { |
1302 | req->r_osd = NULL; | 1347 | WARN_ON(ret != -ENOENT); |
1348 | t->osd = CEPH_HOMELESS_OSD; | ||
1349 | ct_res = CALC_TARGET_POOL_DNE; | ||
1350 | goto out; | ||
1351 | } | ||
1352 | last_pgid.pool = pgid.pool; | ||
1353 | last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); | ||
1354 | |||
1355 | ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); | ||
1356 | if (any_change && | ||
1357 | ceph_is_new_interval(&t->acting, | ||
1358 | &acting, | ||
1359 | &t->up, | ||
1360 | &up, | ||
1361 | t->size, | ||
1362 | pi->size, | ||
1363 | t->min_size, | ||
1364 | pi->min_size, | ||
1365 | t->pg_num, | ||
1366 | pi->pg_num, | ||
1367 | t->sort_bitwise, | ||
1368 | sort_bitwise, | ||
1369 | &last_pgid)) | ||
1370 | force_resend = true; | ||
1371 | |||
1372 | if (t->paused && !target_should_be_paused(osdc, t, pi)) { | ||
1373 | t->paused = false; | ||
1374 | need_resend = true; | ||
1303 | } | 1375 | } |
1304 | 1376 | ||
1305 | list_del_init(&req->r_req_lru_item); | 1377 | if (ceph_pg_compare(&t->pgid, &pgid) || |
1306 | ceph_osdc_put_request(req); | 1378 | ceph_osds_changed(&t->acting, &acting, any_change) || |
1379 | force_resend) { | ||
1380 | t->pgid = pgid; /* struct */ | ||
1381 | ceph_osds_copy(&t->acting, &acting); | ||
1382 | ceph_osds_copy(&t->up, &up); | ||
1383 | t->size = pi->size; | ||
1384 | t->min_size = pi->min_size; | ||
1385 | t->pg_num = pi->pg_num; | ||
1386 | t->pg_num_mask = pi->pg_num_mask; | ||
1387 | t->sort_bitwise = sort_bitwise; | ||
1388 | |||
1389 | t->osd = acting.primary; | ||
1390 | need_resend = true; | ||
1391 | } | ||
1392 | |||
1393 | ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; | ||
1394 | out: | ||
1395 | dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); | ||
1396 | return ct_res; | ||
1397 | } | ||
1398 | |||
1399 | static void setup_request_data(struct ceph_osd_request *req, | ||
1400 | struct ceph_msg *msg) | ||
1401 | { | ||
1402 | u32 data_len = 0; | ||
1403 | int i; | ||
1404 | |||
1405 | if (!list_empty(&msg->data)) | ||
1406 | return; | ||
1407 | |||
1408 | WARN_ON(msg->data_length); | ||
1409 | for (i = 0; i < req->r_num_ops; i++) { | ||
1410 | struct ceph_osd_req_op *op = &req->r_ops[i]; | ||
1411 | |||
1412 | switch (op->op) { | ||
1413 | /* request */ | ||
1414 | case CEPH_OSD_OP_WRITE: | ||
1415 | case CEPH_OSD_OP_WRITEFULL: | ||
1416 | WARN_ON(op->indata_len != op->extent.length); | ||
1417 | ceph_osdc_msg_data_add(msg, &op->extent.osd_data); | ||
1418 | break; | ||
1419 | case CEPH_OSD_OP_SETXATTR: | ||
1420 | case CEPH_OSD_OP_CMPXATTR: | ||
1421 | WARN_ON(op->indata_len != op->xattr.name_len + | ||
1422 | op->xattr.value_len); | ||
1423 | ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); | ||
1424 | break; | ||
1425 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
1426 | ceph_osdc_msg_data_add(msg, | ||
1427 | &op->notify_ack.request_data); | ||
1428 | break; | ||
1429 | |||
1430 | /* reply */ | ||
1431 | case CEPH_OSD_OP_STAT: | ||
1432 | ceph_osdc_msg_data_add(req->r_reply, | ||
1433 | &op->raw_data_in); | ||
1434 | break; | ||
1435 | case CEPH_OSD_OP_READ: | ||
1436 | ceph_osdc_msg_data_add(req->r_reply, | ||
1437 | &op->extent.osd_data); | ||
1438 | break; | ||
1439 | |||
1440 | /* both */ | ||
1441 | case CEPH_OSD_OP_CALL: | ||
1442 | WARN_ON(op->indata_len != op->cls.class_len + | ||
1443 | op->cls.method_len + | ||
1444 | op->cls.indata_len); | ||
1445 | ceph_osdc_msg_data_add(msg, &op->cls.request_info); | ||
1446 | /* optional, can be NONE */ | ||
1447 | ceph_osdc_msg_data_add(msg, &op->cls.request_data); | ||
1448 | /* optional, can be NONE */ | ||
1449 | ceph_osdc_msg_data_add(req->r_reply, | ||
1450 | &op->cls.response_data); | ||
1451 | break; | ||
1452 | case CEPH_OSD_OP_NOTIFY: | ||
1453 | ceph_osdc_msg_data_add(msg, | ||
1454 | &op->notify.request_data); | ||
1455 | ceph_osdc_msg_data_add(req->r_reply, | ||
1456 | &op->notify.response_data); | ||
1457 | break; | ||
1458 | } | ||
1459 | |||
1460 | data_len += op->indata_len; | ||
1461 | } | ||
1462 | |||
1463 | WARN_ON(data_len != msg->data_length); | ||
1464 | } | ||
1465 | |||
1466 | static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) | ||
1467 | { | ||
1468 | void *p = msg->front.iov_base; | ||
1469 | void *const end = p + msg->front_alloc_len; | ||
1470 | u32 data_len = 0; | ||
1471 | int i; | ||
1472 | |||
1473 | if (req->r_flags & CEPH_OSD_FLAG_WRITE) { | ||
1474 | /* snapshots aren't writeable */ | ||
1475 | WARN_ON(req->r_snapid != CEPH_NOSNAP); | ||
1476 | } else { | ||
1477 | WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec || | ||
1478 | req->r_data_offset || req->r_snapc); | ||
1479 | } | ||
1480 | |||
1481 | setup_request_data(req, msg); | ||
1482 | |||
1483 | ceph_encode_32(&p, 1); /* client_inc, always 1 */ | ||
1484 | ceph_encode_32(&p, req->r_osdc->osdmap->epoch); | ||
1485 | ceph_encode_32(&p, req->r_flags); | ||
1486 | ceph_encode_timespec(p, &req->r_mtime); | ||
1487 | p += sizeof(struct ceph_timespec); | ||
1488 | /* aka reassert_version */ | ||
1489 | memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); | ||
1490 | p += sizeof(req->r_replay_version); | ||
1491 | |||
1492 | /* oloc */ | ||
1493 | ceph_encode_8(&p, 4); | ||
1494 | ceph_encode_8(&p, 4); | ||
1495 | ceph_encode_32(&p, 8 + 4 + 4); | ||
1496 | ceph_encode_64(&p, req->r_t.target_oloc.pool); | ||
1497 | ceph_encode_32(&p, -1); /* preferred */ | ||
1498 | ceph_encode_32(&p, 0); /* key len */ | ||
1499 | |||
1500 | /* pgid */ | ||
1501 | ceph_encode_8(&p, 1); | ||
1502 | ceph_encode_64(&p, req->r_t.pgid.pool); | ||
1503 | ceph_encode_32(&p, req->r_t.pgid.seed); | ||
1504 | ceph_encode_32(&p, -1); /* preferred */ | ||
1505 | |||
1506 | /* oid */ | ||
1507 | ceph_encode_32(&p, req->r_t.target_oid.name_len); | ||
1508 | memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); | ||
1509 | p += req->r_t.target_oid.name_len; | ||
1307 | 1510 | ||
1308 | if (osdc->num_requests == 0) { | 1511 | /* ops, can imply data */ |
1309 | dout(" no requests, canceling timeout\n"); | 1512 | ceph_encode_16(&p, req->r_num_ops); |
1310 | __cancel_osd_timeout(osdc); | 1513 | for (i = 0; i < req->r_num_ops; i++) { |
1514 | data_len += osd_req_encode_op(p, &req->r_ops[i]); | ||
1515 | p += sizeof(struct ceph_osd_op); | ||
1311 | } | 1516 | } |
1517 | |||
1518 | ceph_encode_64(&p, req->r_snapid); /* snapid */ | ||
1519 | if (req->r_snapc) { | ||
1520 | ceph_encode_64(&p, req->r_snapc->seq); | ||
1521 | ceph_encode_32(&p, req->r_snapc->num_snaps); | ||
1522 | for (i = 0; i < req->r_snapc->num_snaps; i++) | ||
1523 | ceph_encode_64(&p, req->r_snapc->snaps[i]); | ||
1524 | } else { | ||
1525 | ceph_encode_64(&p, 0); /* snap_seq */ | ||
1526 | ceph_encode_32(&p, 0); /* snaps len */ | ||
1527 | } | ||
1528 | |||
1529 | ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ | ||
1530 | |||
1531 | BUG_ON(p > end); | ||
1532 | msg->front.iov_len = p - msg->front.iov_base; | ||
1533 | msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ | ||
1534 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | ||
1535 | msg->hdr.data_len = cpu_to_le32(data_len); | ||
1536 | /* | ||
1537 | * The header "data_off" is a hint to the receiver allowing it | ||
1538 | * to align received data into its buffers such that there's no | ||
1539 | * need to re-copy it before writing it to disk (direct I/O). | ||
1540 | */ | ||
1541 | msg->hdr.data_off = cpu_to_le16(req->r_data_offset); | ||
1542 | |||
1543 | dout("%s req %p oid %*pE oid_len %d front %zu data %u\n", __func__, | ||
1544 | req, req->r_t.target_oid.name_len, req->r_t.target_oid.name, | ||
1545 | req->r_t.target_oid.name_len, msg->front.iov_len, data_len); | ||
1312 | } | 1546 | } |
1313 | 1547 | ||
1314 | /* | 1548 | /* |
1315 | * Cancel a previously queued request message | 1549 | * @req has to be assigned a tid and registered. |
1316 | */ | 1550 | */ |
1317 | static void __cancel_request(struct ceph_osd_request *req) | 1551 | static void send_request(struct ceph_osd_request *req) |
1318 | { | 1552 | { |
1319 | if (req->r_sent && req->r_osd) { | 1553 | struct ceph_osd *osd = req->r_osd; |
1554 | |||
1555 | verify_osd_locked(osd); | ||
1556 | WARN_ON(osd->o_osd != req->r_t.osd); | ||
1557 | |||
1558 | /* | ||
1559 | * We may have a previously queued request message hanging | ||
1560 | * around. Cancel it to avoid corrupting the msgr. | ||
1561 | */ | ||
1562 | if (req->r_sent) | ||
1320 | ceph_msg_revoke(req->r_request); | 1563 | ceph_msg_revoke(req->r_request); |
1321 | req->r_sent = 0; | 1564 | |
1565 | req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; | ||
1566 | if (req->r_attempts) | ||
1567 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | ||
1568 | else | ||
1569 | WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); | ||
1570 | |||
1571 | encode_request(req, req->r_request); | ||
1572 | |||
1573 | dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", | ||
1574 | __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, | ||
1575 | req->r_t.osd, req->r_flags, req->r_attempts); | ||
1576 | |||
1577 | req->r_t.paused = false; | ||
1578 | req->r_stamp = jiffies; | ||
1579 | req->r_attempts++; | ||
1580 | |||
1581 | req->r_sent = osd->o_incarnation; | ||
1582 | req->r_request->hdr.tid = cpu_to_le64(req->r_tid); | ||
1583 | ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); | ||
1584 | } | ||
1585 | |||
1586 | static void maybe_request_map(struct ceph_osd_client *osdc) | ||
1587 | { | ||
1588 | bool continuous = false; | ||
1589 | |||
1590 | verify_osdc_locked(osdc); | ||
1591 | WARN_ON(!osdc->osdmap->epoch); | ||
1592 | |||
1593 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || | ||
1594 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || | ||
1595 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { | ||
1596 | dout("%s osdc %p continuous\n", __func__, osdc); | ||
1597 | continuous = true; | ||
1598 | } else { | ||
1599 | dout("%s osdc %p onetime\n", __func__, osdc); | ||
1322 | } | 1600 | } |
1601 | |||
1602 | if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, | ||
1603 | osdc->osdmap->epoch + 1, continuous)) | ||
1604 | ceph_monc_renew_subs(&osdc->client->monc); | ||
1323 | } | 1605 | } |
1324 | 1606 | ||
1325 | static void __register_linger_request(struct ceph_osd_client *osdc, | 1607 | static void send_map_check(struct ceph_osd_request *req); |
1326 | struct ceph_osd_request *req) | 1608 | |
1609 | static void __submit_request(struct ceph_osd_request *req, bool wrlocked) | ||
1327 | { | 1610 | { |
1328 | dout("%s %p tid %llu\n", __func__, req, req->r_tid); | 1611 | struct ceph_osd_client *osdc = req->r_osdc; |
1329 | WARN_ON(!req->r_linger); | 1612 | struct ceph_osd *osd; |
1613 | enum calc_target_result ct_res; | ||
1614 | bool need_send = false; | ||
1615 | bool promoted = false; | ||
1616 | |||
1617 | WARN_ON(req->r_tid || req->r_got_reply); | ||
1618 | dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); | ||
1619 | |||
1620 | again: | ||
1621 | ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); | ||
1622 | if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) | ||
1623 | goto promote; | ||
1624 | |||
1625 | osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); | ||
1626 | if (IS_ERR(osd)) { | ||
1627 | WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); | ||
1628 | goto promote; | ||
1629 | } | ||
1630 | |||
1631 | if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && | ||
1632 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { | ||
1633 | dout("req %p pausewr\n", req); | ||
1634 | req->r_t.paused = true; | ||
1635 | maybe_request_map(osdc); | ||
1636 | } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && | ||
1637 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { | ||
1638 | dout("req %p pauserd\n", req); | ||
1639 | req->r_t.paused = true; | ||
1640 | maybe_request_map(osdc); | ||
1641 | } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && | ||
1642 | !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | | ||
1643 | CEPH_OSD_FLAG_FULL_FORCE)) && | ||
1644 | (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || | ||
1645 | pool_full(osdc, req->r_t.base_oloc.pool))) { | ||
1646 | dout("req %p full/pool_full\n", req); | ||
1647 | pr_warn_ratelimited("FULL or reached pool quota\n"); | ||
1648 | req->r_t.paused = true; | ||
1649 | maybe_request_map(osdc); | ||
1650 | } else if (!osd_homeless(osd)) { | ||
1651 | need_send = true; | ||
1652 | } else { | ||
1653 | maybe_request_map(osdc); | ||
1654 | } | ||
1655 | |||
1656 | mutex_lock(&osd->lock); | ||
1657 | /* | ||
1658 | * Assign the tid atomically with send_request() to protect | ||
1659 | * multiple writes to the same object from racing with each | ||
1660 | * other, resulting in out of order ops on the OSDs. | ||
1661 | */ | ||
1662 | req->r_tid = atomic64_inc_return(&osdc->last_tid); | ||
1663 | link_request(osd, req); | ||
1664 | if (need_send) | ||
1665 | send_request(req); | ||
1666 | mutex_unlock(&osd->lock); | ||
1330 | 1667 | ||
1668 | if (ct_res == CALC_TARGET_POOL_DNE) | ||
1669 | send_map_check(req); | ||
1670 | |||
1671 | if (promoted) | ||
1672 | downgrade_write(&osdc->lock); | ||
1673 | return; | ||
1674 | |||
1675 | promote: | ||
1676 | up_read(&osdc->lock); | ||
1677 | down_write(&osdc->lock); | ||
1678 | wrlocked = true; | ||
1679 | promoted = true; | ||
1680 | goto again; | ||
1681 | } | ||
1682 | |||
1683 | static void account_request(struct ceph_osd_request *req) | ||
1684 | { | ||
1685 | unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; | ||
1686 | |||
1687 | if (req->r_flags & CEPH_OSD_FLAG_READ) { | ||
1688 | WARN_ON(req->r_flags & mask); | ||
1689 | req->r_flags |= CEPH_OSD_FLAG_ACK; | ||
1690 | } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) | ||
1691 | WARN_ON(!(req->r_flags & mask)); | ||
1692 | else | ||
1693 | WARN_ON(1); | ||
1694 | |||
1695 | WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); | ||
1696 | atomic_inc(&req->r_osdc->num_requests); | ||
1697 | } | ||
1698 | |||
1699 | static void submit_request(struct ceph_osd_request *req, bool wrlocked) | ||
1700 | { | ||
1331 | ceph_osdc_get_request(req); | 1701 | ceph_osdc_get_request(req); |
1332 | list_add_tail(&req->r_linger_item, &osdc->req_linger); | 1702 | account_request(req); |
1333 | if (req->r_osd) | 1703 | __submit_request(req, wrlocked); |
1334 | list_add_tail(&req->r_linger_osd_item, | ||
1335 | &req->r_osd->o_linger_requests); | ||
1336 | } | 1704 | } |
1337 | 1705 | ||
1338 | static void __unregister_linger_request(struct ceph_osd_client *osdc, | 1706 | static void __finish_request(struct ceph_osd_request *req) |
1339 | struct ceph_osd_request *req) | ||
1340 | { | 1707 | { |
1341 | WARN_ON(!req->r_linger); | 1708 | struct ceph_osd_client *osdc = req->r_osdc; |
1709 | struct ceph_osd *osd = req->r_osd; | ||
1342 | 1710 | ||
1343 | if (list_empty(&req->r_linger_item)) { | 1711 | verify_osd_locked(osd); |
1344 | dout("%s %p tid %llu not registered\n", __func__, req, | 1712 | dout("%s req %p tid %llu\n", __func__, req, req->r_tid); |
1345 | req->r_tid); | 1713 | |
1714 | WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); | ||
1715 | unlink_request(osd, req); | ||
1716 | atomic_dec(&osdc->num_requests); | ||
1717 | |||
1718 | /* | ||
1719 | * If an OSD has failed or returned and a request has been sent | ||
1720 | * twice, it's possible to get a reply and end up here while the | ||
1721 | * request message is queued for delivery. We will ignore the | ||
1722 | * reply, so not a big deal, but better to try and catch it. | ||
1723 | */ | ||
1724 | ceph_msg_revoke(req->r_request); | ||
1725 | ceph_msg_revoke_incoming(req->r_reply); | ||
1726 | } | ||
1727 | |||
1728 | static void finish_request(struct ceph_osd_request *req) | ||
1729 | { | ||
1730 | __finish_request(req); | ||
1731 | ceph_osdc_put_request(req); | ||
1732 | } | ||
1733 | |||
1734 | static void __complete_request(struct ceph_osd_request *req) | ||
1735 | { | ||
1736 | if (req->r_callback) | ||
1737 | req->r_callback(req); | ||
1738 | else | ||
1739 | complete_all(&req->r_completion); | ||
1740 | } | ||
1741 | |||
1742 | /* | ||
1743 | * Note that this is open-coded in handle_reply(), which has to deal | ||
1744 | * with ack vs commit, dup acks, etc. | ||
1745 | */ | ||
1746 | static void complete_request(struct ceph_osd_request *req, int err) | ||
1747 | { | ||
1748 | dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); | ||
1749 | |||
1750 | req->r_result = err; | ||
1751 | __finish_request(req); | ||
1752 | __complete_request(req); | ||
1753 | complete_all(&req->r_safe_completion); | ||
1754 | ceph_osdc_put_request(req); | ||
1755 | } | ||
1756 | |||
1757 | static void cancel_map_check(struct ceph_osd_request *req) | ||
1758 | { | ||
1759 | struct ceph_osd_client *osdc = req->r_osdc; | ||
1760 | struct ceph_osd_request *lookup_req; | ||
1761 | |||
1762 | verify_osdc_wrlocked(osdc); | ||
1763 | |||
1764 | lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); | ||
1765 | if (!lookup_req) | ||
1346 | return; | 1766 | return; |
1767 | |||
1768 | WARN_ON(lookup_req != req); | ||
1769 | erase_request_mc(&osdc->map_checks, req); | ||
1770 | ceph_osdc_put_request(req); | ||
1771 | } | ||
1772 | |||
1773 | static void cancel_request(struct ceph_osd_request *req) | ||
1774 | { | ||
1775 | dout("%s req %p tid %llu\n", __func__, req, req->r_tid); | ||
1776 | |||
1777 | cancel_map_check(req); | ||
1778 | finish_request(req); | ||
1779 | } | ||
1780 | |||
1781 | static void check_pool_dne(struct ceph_osd_request *req) | ||
1782 | { | ||
1783 | struct ceph_osd_client *osdc = req->r_osdc; | ||
1784 | struct ceph_osdmap *map = osdc->osdmap; | ||
1785 | |||
1786 | verify_osdc_wrlocked(osdc); | ||
1787 | WARN_ON(!map->epoch); | ||
1788 | |||
1789 | if (req->r_attempts) { | ||
1790 | /* | ||
1791 | * We sent a request earlier, which means that | ||
1792 | * previously the pool existed, and now it does not | ||
1793 | * (i.e., it was deleted). | ||
1794 | */ | ||
1795 | req->r_map_dne_bound = map->epoch; | ||
1796 | dout("%s req %p tid %llu pool disappeared\n", __func__, req, | ||
1797 | req->r_tid); | ||
1798 | } else { | ||
1799 | dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__, | ||
1800 | req, req->r_tid, req->r_map_dne_bound, map->epoch); | ||
1347 | } | 1801 | } |
1348 | 1802 | ||
1349 | dout("%s %p tid %llu\n", __func__, req, req->r_tid); | 1803 | if (req->r_map_dne_bound) { |
1350 | list_del_init(&req->r_linger_item); | 1804 | if (map->epoch >= req->r_map_dne_bound) { |
1805 | /* we had a new enough map */ | ||
1806 | pr_info_ratelimited("tid %llu pool does not exist\n", | ||
1807 | req->r_tid); | ||
1808 | complete_request(req, -ENOENT); | ||
1809 | } | ||
1810 | } else { | ||
1811 | send_map_check(req); | ||
1812 | } | ||
1813 | } | ||
1351 | 1814 | ||
1352 | if (req->r_osd) { | 1815 | static void map_check_cb(struct ceph_mon_generic_request *greq) |
1353 | list_del_init(&req->r_linger_osd_item); | 1816 | { |
1354 | maybe_move_osd_to_lru(osdc, req->r_osd); | 1817 | struct ceph_osd_client *osdc = &greq->monc->client->osdc; |
1355 | if (list_empty(&req->r_osd_item)) | 1818 | struct ceph_osd_request *req; |
1356 | req->r_osd = NULL; | 1819 | u64 tid = greq->private_data; |
1820 | |||
1821 | WARN_ON(greq->result || !greq->u.newest); | ||
1822 | |||
1823 | down_write(&osdc->lock); | ||
1824 | req = lookup_request_mc(&osdc->map_checks, tid); | ||
1825 | if (!req) { | ||
1826 | dout("%s tid %llu dne\n", __func__, tid); | ||
1827 | goto out_unlock; | ||
1357 | } | 1828 | } |
1829 | |||
1830 | dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__, | ||
1831 | req, req->r_tid, req->r_map_dne_bound, greq->u.newest); | ||
1832 | if (!req->r_map_dne_bound) | ||
1833 | req->r_map_dne_bound = greq->u.newest; | ||
1834 | erase_request_mc(&osdc->map_checks, req); | ||
1835 | check_pool_dne(req); | ||
1836 | |||
1358 | ceph_osdc_put_request(req); | 1837 | ceph_osdc_put_request(req); |
1838 | out_unlock: | ||
1839 | up_write(&osdc->lock); | ||
1359 | } | 1840 | } |
1360 | 1841 | ||
1361 | void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | 1842 | static void send_map_check(struct ceph_osd_request *req) |
1362 | struct ceph_osd_request *req) | ||
1363 | { | 1843 | { |
1364 | if (!req->r_linger) { | 1844 | struct ceph_osd_client *osdc = req->r_osdc; |
1365 | dout("set_request_linger %p\n", req); | 1845 | struct ceph_osd_request *lookup_req; |
1366 | req->r_linger = 1; | 1846 | int ret; |
1847 | |||
1848 | verify_osdc_wrlocked(osdc); | ||
1849 | |||
1850 | lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); | ||
1851 | if (lookup_req) { | ||
1852 | WARN_ON(lookup_req != req); | ||
1853 | return; | ||
1367 | } | 1854 | } |
1855 | |||
1856 | ceph_osdc_get_request(req); | ||
1857 | insert_request_mc(&osdc->map_checks, req); | ||
1858 | ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", | ||
1859 | map_check_cb, req->r_tid); | ||
1860 | WARN_ON(ret); | ||
1368 | } | 1861 | } |
1369 | EXPORT_SYMBOL(ceph_osdc_set_request_linger); | ||
1370 | 1862 | ||
1371 | /* | 1863 | /* |
1372 | * Returns whether a request should be blocked from being sent | 1864 | * lingering requests, watch/notify v2 infrastructure |
1373 | * based on the current osdmap and osd_client settings. | ||
1374 | * | ||
1375 | * Caller should hold map_sem for read. | ||
1376 | */ | 1865 | */ |
1377 | static bool __req_should_be_paused(struct ceph_osd_client *osdc, | 1866 | static void linger_release(struct kref *kref) |
1378 | struct ceph_osd_request *req) | ||
1379 | { | 1867 | { |
1380 | bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); | 1868 | struct ceph_osd_linger_request *lreq = |
1381 | bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || | 1869 | container_of(kref, struct ceph_osd_linger_request, kref); |
1382 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); | 1870 | |
1383 | return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || | 1871 | dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq, |
1384 | (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); | 1872 | lreq->reg_req, lreq->ping_req); |
1873 | WARN_ON(!RB_EMPTY_NODE(&lreq->node)); | ||
1874 | WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); | ||
1875 | WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node)); | ||
1876 | WARN_ON(!list_empty(&lreq->scan_item)); | ||
1877 | WARN_ON(!list_empty(&lreq->pending_lworks)); | ||
1878 | WARN_ON(lreq->osd); | ||
1879 | |||
1880 | if (lreq->reg_req) | ||
1881 | ceph_osdc_put_request(lreq->reg_req); | ||
1882 | if (lreq->ping_req) | ||
1883 | ceph_osdc_put_request(lreq->ping_req); | ||
1884 | target_destroy(&lreq->t); | ||
1885 | kfree(lreq); | ||
1385 | } | 1886 | } |
1386 | 1887 | ||
1888 | static void linger_put(struct ceph_osd_linger_request *lreq) | ||
1889 | { | ||
1890 | if (lreq) | ||
1891 | kref_put(&lreq->kref, linger_release); | ||
1892 | } | ||
1893 | |||
1894 | static struct ceph_osd_linger_request * | ||
1895 | linger_get(struct ceph_osd_linger_request *lreq) | ||
1896 | { | ||
1897 | kref_get(&lreq->kref); | ||
1898 | return lreq; | ||
1899 | } | ||
1900 | |||
1901 | static struct ceph_osd_linger_request * | ||
1902 | linger_alloc(struct ceph_osd_client *osdc) | ||
1903 | { | ||
1904 | struct ceph_osd_linger_request *lreq; | ||
1905 | |||
1906 | lreq = kzalloc(sizeof(*lreq), GFP_NOIO); | ||
1907 | if (!lreq) | ||
1908 | return NULL; | ||
1909 | |||
1910 | kref_init(&lreq->kref); | ||
1911 | mutex_init(&lreq->lock); | ||
1912 | RB_CLEAR_NODE(&lreq->node); | ||
1913 | RB_CLEAR_NODE(&lreq->osdc_node); | ||
1914 | RB_CLEAR_NODE(&lreq->mc_node); | ||
1915 | INIT_LIST_HEAD(&lreq->scan_item); | ||
1916 | INIT_LIST_HEAD(&lreq->pending_lworks); | ||
1917 | init_completion(&lreq->reg_commit_wait); | ||
1918 | init_completion(&lreq->notify_finish_wait); | ||
1919 | |||
1920 | lreq->osdc = osdc; | ||
1921 | target_init(&lreq->t); | ||
1922 | |||
1923 | dout("%s lreq %p\n", __func__, lreq); | ||
1924 | return lreq; | ||
1925 | } | ||
1926 | |||
1927 | DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) | ||
1928 | DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) | ||
1929 | DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node) | ||
1930 | |||
1387 | /* | 1931 | /* |
1388 | * Calculate mapping of a request to a PG. Takes tiering into account. | 1932 | * Create linger request <-> OSD session relation. |
1933 | * | ||
1934 | * @lreq has to be registered, @osd may be homeless. | ||
1389 | */ | 1935 | */ |
1390 | static int __calc_request_pg(struct ceph_osdmap *osdmap, | 1936 | static void link_linger(struct ceph_osd *osd, |
1391 | struct ceph_osd_request *req, | 1937 | struct ceph_osd_linger_request *lreq) |
1392 | struct ceph_pg *pg_out) | ||
1393 | { | 1938 | { |
1394 | bool need_check_tiering; | 1939 | verify_osd_locked(osd); |
1940 | WARN_ON(!lreq->linger_id || lreq->osd); | ||
1941 | dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, | ||
1942 | osd->o_osd, lreq, lreq->linger_id); | ||
1395 | 1943 | ||
1396 | need_check_tiering = false; | 1944 | if (!osd_homeless(osd)) |
1397 | if (req->r_target_oloc.pool == -1) { | 1945 | __remove_osd_from_lru(osd); |
1398 | req->r_target_oloc = req->r_base_oloc; /* struct */ | 1946 | else |
1399 | need_check_tiering = true; | 1947 | atomic_inc(&osd->o_osdc->num_homeless); |
1948 | |||
1949 | get_osd(osd); | ||
1950 | insert_linger(&osd->o_linger_requests, lreq); | ||
1951 | lreq->osd = osd; | ||
1952 | } | ||
1953 | |||
1954 | static void unlink_linger(struct ceph_osd *osd, | ||
1955 | struct ceph_osd_linger_request *lreq) | ||
1956 | { | ||
1957 | verify_osd_locked(osd); | ||
1958 | WARN_ON(lreq->osd != osd); | ||
1959 | dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, | ||
1960 | osd->o_osd, lreq, lreq->linger_id); | ||
1961 | |||
1962 | lreq->osd = NULL; | ||
1963 | erase_linger(&osd->o_linger_requests, lreq); | ||
1964 | put_osd(osd); | ||
1965 | |||
1966 | if (!osd_homeless(osd)) | ||
1967 | maybe_move_osd_to_lru(osd); | ||
1968 | else | ||
1969 | atomic_dec(&osd->o_osdc->num_homeless); | ||
1970 | } | ||
1971 | |||
1972 | static bool __linger_registered(struct ceph_osd_linger_request *lreq) | ||
1973 | { | ||
1974 | verify_osdc_locked(lreq->osdc); | ||
1975 | |||
1976 | return !RB_EMPTY_NODE(&lreq->osdc_node); | ||
1977 | } | ||
1978 | |||
1979 | static bool linger_registered(struct ceph_osd_linger_request *lreq) | ||
1980 | { | ||
1981 | struct ceph_osd_client *osdc = lreq->osdc; | ||
1982 | bool registered; | ||
1983 | |||
1984 | down_read(&osdc->lock); | ||
1985 | registered = __linger_registered(lreq); | ||
1986 | up_read(&osdc->lock); | ||
1987 | |||
1988 | return registered; | ||
1989 | } | ||
1990 | |||
1991 | static void linger_register(struct ceph_osd_linger_request *lreq) | ||
1992 | { | ||
1993 | struct ceph_osd_client *osdc = lreq->osdc; | ||
1994 | |||
1995 | verify_osdc_wrlocked(osdc); | ||
1996 | WARN_ON(lreq->linger_id); | ||
1997 | |||
1998 | linger_get(lreq); | ||
1999 | lreq->linger_id = ++osdc->last_linger_id; | ||
2000 | insert_linger_osdc(&osdc->linger_requests, lreq); | ||
2001 | } | ||
2002 | |||
2003 | static void linger_unregister(struct ceph_osd_linger_request *lreq) | ||
2004 | { | ||
2005 | struct ceph_osd_client *osdc = lreq->osdc; | ||
2006 | |||
2007 | verify_osdc_wrlocked(osdc); | ||
2008 | |||
2009 | erase_linger_osdc(&osdc->linger_requests, lreq); | ||
2010 | linger_put(lreq); | ||
2011 | } | ||
2012 | |||
2013 | static void cancel_linger_request(struct ceph_osd_request *req) | ||
2014 | { | ||
2015 | struct ceph_osd_linger_request *lreq = req->r_priv; | ||
2016 | |||
2017 | WARN_ON(!req->r_linger); | ||
2018 | cancel_request(req); | ||
2019 | linger_put(lreq); | ||
2020 | } | ||
2021 | |||
2022 | struct linger_work { | ||
2023 | struct work_struct work; | ||
2024 | struct ceph_osd_linger_request *lreq; | ||
2025 | struct list_head pending_item; | ||
2026 | unsigned long queued_stamp; | ||
2027 | |||
2028 | union { | ||
2029 | struct { | ||
2030 | u64 notify_id; | ||
2031 | u64 notifier_id; | ||
2032 | void *payload; /* points into @msg front */ | ||
2033 | size_t payload_len; | ||
2034 | |||
2035 | struct ceph_msg *msg; /* for ceph_msg_put() */ | ||
2036 | } notify; | ||
2037 | struct { | ||
2038 | int err; | ||
2039 | } error; | ||
2040 | }; | ||
2041 | }; | ||
2042 | |||
2043 | static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, | ||
2044 | work_func_t workfn) | ||
2045 | { | ||
2046 | struct linger_work *lwork; | ||
2047 | |||
2048 | lwork = kzalloc(sizeof(*lwork), GFP_NOIO); | ||
2049 | if (!lwork) | ||
2050 | return NULL; | ||
2051 | |||
2052 | INIT_WORK(&lwork->work, workfn); | ||
2053 | INIT_LIST_HEAD(&lwork->pending_item); | ||
2054 | lwork->lreq = linger_get(lreq); | ||
2055 | |||
2056 | return lwork; | ||
2057 | } | ||
2058 | |||
2059 | static void lwork_free(struct linger_work *lwork) | ||
2060 | { | ||
2061 | struct ceph_osd_linger_request *lreq = lwork->lreq; | ||
2062 | |||
2063 | mutex_lock(&lreq->lock); | ||
2064 | list_del(&lwork->pending_item); | ||
2065 | mutex_unlock(&lreq->lock); | ||
2066 | |||
2067 | linger_put(lreq); | ||
2068 | kfree(lwork); | ||
2069 | } | ||
2070 | |||
2071 | static void lwork_queue(struct linger_work *lwork) | ||
2072 | { | ||
2073 | struct ceph_osd_linger_request *lreq = lwork->lreq; | ||
2074 | struct ceph_osd_client *osdc = lreq->osdc; | ||
2075 | |||
2076 | verify_lreq_locked(lreq); | ||
2077 | WARN_ON(!list_empty(&lwork->pending_item)); | ||
2078 | |||
2079 | lwork->queued_stamp = jiffies; | ||
2080 | list_add_tail(&lwork->pending_item, &lreq->pending_lworks); | ||
2081 | queue_work(osdc->notify_wq, &lwork->work); | ||
2082 | } | ||
2083 | |||
2084 | static void do_watch_notify(struct work_struct *w) | ||
2085 | { | ||
2086 | struct linger_work *lwork = container_of(w, struct linger_work, work); | ||
2087 | struct ceph_osd_linger_request *lreq = lwork->lreq; | ||
2088 | |||
2089 | if (!linger_registered(lreq)) { | ||
2090 | dout("%s lreq %p not registered\n", __func__, lreq); | ||
2091 | goto out; | ||
1400 | } | 2092 | } |
1401 | if (req->r_target_oid.name_len == 0) { | 2093 | |
1402 | ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); | 2094 | WARN_ON(!lreq->is_watch); |
1403 | need_check_tiering = true; | 2095 | dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", |
2096 | __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, | ||
2097 | lwork->notify.payload_len); | ||
2098 | lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id, | ||
2099 | lwork->notify.notifier_id, lwork->notify.payload, | ||
2100 | lwork->notify.payload_len); | ||
2101 | |||
2102 | out: | ||
2103 | ceph_msg_put(lwork->notify.msg); | ||
2104 | lwork_free(lwork); | ||
2105 | } | ||
2106 | |||
2107 | static void do_watch_error(struct work_struct *w) | ||
2108 | { | ||
2109 | struct linger_work *lwork = container_of(w, struct linger_work, work); | ||
2110 | struct ceph_osd_linger_request *lreq = lwork->lreq; | ||
2111 | |||
2112 | if (!linger_registered(lreq)) { | ||
2113 | dout("%s lreq %p not registered\n", __func__, lreq); | ||
2114 | goto out; | ||
1404 | } | 2115 | } |
1405 | 2116 | ||
1406 | if (need_check_tiering && | 2117 | dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err); |
1407 | (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { | 2118 | lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err); |
1408 | struct ceph_pg_pool_info *pi; | 2119 | |
1409 | 2120 | out: | |
1410 | pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); | 2121 | lwork_free(lwork); |
1411 | if (pi) { | 2122 | } |
1412 | if ((req->r_flags & CEPH_OSD_FLAG_READ) && | 2123 | |
1413 | pi->read_tier >= 0) | 2124 | static void queue_watch_error(struct ceph_osd_linger_request *lreq) |
1414 | req->r_target_oloc.pool = pi->read_tier; | 2125 | { |
1415 | if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && | 2126 | struct linger_work *lwork; |
1416 | pi->write_tier >= 0) | 2127 | |
1417 | req->r_target_oloc.pool = pi->write_tier; | 2128 | lwork = lwork_alloc(lreq, do_watch_error); |
2129 | if (!lwork) { | ||
2130 | pr_err("failed to allocate error-lwork\n"); | ||
2131 | return; | ||
2132 | } | ||
2133 | |||
2134 | lwork->error.err = lreq->last_error; | ||
2135 | lwork_queue(lwork); | ||
2136 | } | ||
2137 | |||
2138 | static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq, | ||
2139 | int result) | ||
2140 | { | ||
2141 | if (!completion_done(&lreq->reg_commit_wait)) { | ||
2142 | lreq->reg_commit_error = (result <= 0 ? result : 0); | ||
2143 | complete_all(&lreq->reg_commit_wait); | ||
2144 | } | ||
2145 | } | ||
2146 | |||
2147 | static void linger_commit_cb(struct ceph_osd_request *req) | ||
2148 | { | ||
2149 | struct ceph_osd_linger_request *lreq = req->r_priv; | ||
2150 | |||
2151 | mutex_lock(&lreq->lock); | ||
2152 | dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, | ||
2153 | lreq->linger_id, req->r_result); | ||
2154 | WARN_ON(!__linger_registered(lreq)); | ||
2155 | linger_reg_commit_complete(lreq, req->r_result); | ||
2156 | lreq->committed = true; | ||
2157 | |||
2158 | if (!lreq->is_watch) { | ||
2159 | struct ceph_osd_data *osd_data = | ||
2160 | osd_req_op_data(req, 0, notify, response_data); | ||
2161 | void *p = page_address(osd_data->pages[0]); | ||
2162 | |||
2163 | WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY || | ||
2164 | osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); | ||
2165 | |||
2166 | /* make note of the notify_id */ | ||
2167 | if (req->r_ops[0].outdata_len >= sizeof(u64)) { | ||
2168 | lreq->notify_id = ceph_decode_64(&p); | ||
2169 | dout("lreq %p notify_id %llu\n", lreq, | ||
2170 | lreq->notify_id); | ||
2171 | } else { | ||
2172 | dout("lreq %p no notify_id\n", lreq); | ||
1418 | } | 2173 | } |
1419 | /* !pi is caught in ceph_oloc_oid_to_pg() */ | ||
1420 | } | 2174 | } |
1421 | 2175 | ||
1422 | return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, | 2176 | mutex_unlock(&lreq->lock); |
1423 | &req->r_target_oid, pg_out); | 2177 | linger_put(lreq); |
1424 | } | 2178 | } |
1425 | 2179 | ||
1426 | static void __enqueue_request(struct ceph_osd_request *req) | 2180 | static int normalize_watch_error(int err) |
1427 | { | 2181 | { |
1428 | struct ceph_osd_client *osdc = req->r_osdc; | 2182 | /* |
2183 | * Translate ENOENT -> ENOTCONN so that a delete->disconnection | ||
2184 | * notification and a failure to reconnect because we raced with | ||
2185 | * the delete appear the same to the user. | ||
2186 | */ | ||
2187 | if (err == -ENOENT) | ||
2188 | err = -ENOTCONN; | ||
2189 | |||
2190 | return err; | ||
2191 | } | ||
2192 | |||
2193 | static void linger_reconnect_cb(struct ceph_osd_request *req) | ||
2194 | { | ||
2195 | struct ceph_osd_linger_request *lreq = req->r_priv; | ||
2196 | |||
2197 | mutex_lock(&lreq->lock); | ||
2198 | dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, | ||
2199 | lreq, lreq->linger_id, req->r_result, lreq->last_error); | ||
2200 | if (req->r_result < 0) { | ||
2201 | if (!lreq->last_error) { | ||
2202 | lreq->last_error = normalize_watch_error(req->r_result); | ||
2203 | queue_watch_error(lreq); | ||
2204 | } | ||
2205 | } | ||
1429 | 2206 | ||
1430 | dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, | 2207 | mutex_unlock(&lreq->lock); |
1431 | req->r_osd ? req->r_osd->o_osd : -1); | 2208 | linger_put(lreq); |
2209 | } | ||
2210 | |||
2211 | static void send_linger(struct ceph_osd_linger_request *lreq) | ||
2212 | { | ||
2213 | struct ceph_osd_request *req = lreq->reg_req; | ||
2214 | struct ceph_osd_req_op *op = &req->r_ops[0]; | ||
1432 | 2215 | ||
1433 | if (req->r_osd) { | 2216 | verify_osdc_wrlocked(req->r_osdc); |
1434 | __remove_osd_from_lru(req->r_osd); | 2217 | dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); |
1435 | list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); | 2218 | |
1436 | list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); | 2219 | if (req->r_osd) |
2220 | cancel_linger_request(req); | ||
2221 | |||
2222 | request_reinit(req); | ||
2223 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); | ||
2224 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); | ||
2225 | req->r_flags = lreq->t.flags; | ||
2226 | req->r_mtime = lreq->mtime; | ||
2227 | |||
2228 | mutex_lock(&lreq->lock); | ||
2229 | if (lreq->is_watch && lreq->committed) { | ||
2230 | WARN_ON(op->op != CEPH_OSD_OP_WATCH || | ||
2231 | op->watch.cookie != lreq->linger_id); | ||
2232 | op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; | ||
2233 | op->watch.gen = ++lreq->register_gen; | ||
2234 | dout("lreq %p reconnect register_gen %u\n", lreq, | ||
2235 | op->watch.gen); | ||
2236 | req->r_callback = linger_reconnect_cb; | ||
1437 | } else { | 2237 | } else { |
1438 | list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); | 2238 | if (!lreq->is_watch) |
2239 | lreq->notify_id = 0; | ||
2240 | else | ||
2241 | WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); | ||
2242 | dout("lreq %p register\n", lreq); | ||
2243 | req->r_callback = linger_commit_cb; | ||
1439 | } | 2244 | } |
2245 | mutex_unlock(&lreq->lock); | ||
2246 | |||
2247 | req->r_priv = linger_get(lreq); | ||
2248 | req->r_linger = true; | ||
2249 | |||
2250 | submit_request(req, true); | ||
1440 | } | 2251 | } |
1441 | 2252 | ||
1442 | /* | 2253 | static void linger_ping_cb(struct ceph_osd_request *req) |
1443 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct | ||
1444 | * (as needed), and set the request r_osd appropriately. If there is | ||
1445 | * no up osd, set r_osd to NULL. Move the request to the appropriate list | ||
1446 | * (unsent, homeless) or leave on in-flight lru. | ||
1447 | * | ||
1448 | * Return 0 if unchanged, 1 if changed, or negative on error. | ||
1449 | * | ||
1450 | * Caller should hold map_sem for read and request_mutex. | ||
1451 | */ | ||
1452 | static int __map_request(struct ceph_osd_client *osdc, | ||
1453 | struct ceph_osd_request *req, int force_resend) | ||
1454 | { | 2254 | { |
1455 | struct ceph_pg pgid; | 2255 | struct ceph_osd_linger_request *lreq = req->r_priv; |
1456 | int acting[CEPH_PG_MAX_SIZE]; | 2256 | |
1457 | int num, o; | 2257 | mutex_lock(&lreq->lock); |
1458 | int err; | 2258 | dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", |
1459 | bool was_paused; | 2259 | __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, |
1460 | 2260 | lreq->last_error); | |
1461 | dout("map_request %p tid %lld\n", req, req->r_tid); | 2261 | if (lreq->register_gen == req->r_ops[0].watch.gen) { |
1462 | 2262 | if (!req->r_result) { | |
1463 | err = __calc_request_pg(osdc->osdmap, req, &pgid); | 2263 | lreq->watch_valid_thru = lreq->ping_sent; |
1464 | if (err) { | 2264 | } else if (!lreq->last_error) { |
1465 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | 2265 | lreq->last_error = normalize_watch_error(req->r_result); |
1466 | return err; | 2266 | queue_watch_error(lreq); |
1467 | } | ||
1468 | req->r_pgid = pgid; | ||
1469 | |||
1470 | num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); | ||
1471 | if (num < 0) | ||
1472 | num = 0; | ||
1473 | |||
1474 | was_paused = req->r_paused; | ||
1475 | req->r_paused = __req_should_be_paused(osdc, req); | ||
1476 | if (was_paused && !req->r_paused) | ||
1477 | force_resend = 1; | ||
1478 | |||
1479 | if ((!force_resend && | ||
1480 | req->r_osd && req->r_osd->o_osd == o && | ||
1481 | req->r_sent >= req->r_osd->o_incarnation && | ||
1482 | req->r_num_pg_osds == num && | ||
1483 | memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || | ||
1484 | (req->r_osd == NULL && o == -1) || | ||
1485 | req->r_paused) | ||
1486 | return 0; /* no change */ | ||
1487 | |||
1488 | dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", | ||
1489 | req->r_tid, pgid.pool, pgid.seed, o, | ||
1490 | req->r_osd ? req->r_osd->o_osd : -1); | ||
1491 | |||
1492 | /* record full pg acting set */ | ||
1493 | memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); | ||
1494 | req->r_num_pg_osds = num; | ||
1495 | |||
1496 | if (req->r_osd) { | ||
1497 | __cancel_request(req); | ||
1498 | list_del_init(&req->r_osd_item); | ||
1499 | list_del_init(&req->r_linger_osd_item); | ||
1500 | req->r_osd = NULL; | ||
1501 | } | ||
1502 | |||
1503 | req->r_osd = __lookup_osd(osdc, o); | ||
1504 | if (!req->r_osd && o >= 0) { | ||
1505 | err = -ENOMEM; | ||
1506 | req->r_osd = create_osd(osdc, o); | ||
1507 | if (!req->r_osd) { | ||
1508 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | ||
1509 | goto out; | ||
1510 | } | 2267 | } |
2268 | } else { | ||
2269 | dout("lreq %p register_gen %u ignoring old pong %u\n", lreq, | ||
2270 | lreq->register_gen, req->r_ops[0].watch.gen); | ||
2271 | } | ||
1511 | 2272 | ||
1512 | dout("map_request osd %p is osd%d\n", req->r_osd, o); | 2273 | mutex_unlock(&lreq->lock); |
1513 | __insert_osd(osdc, req->r_osd); | 2274 | linger_put(lreq); |
2275 | } | ||
2276 | |||
2277 | static void send_linger_ping(struct ceph_osd_linger_request *lreq) | ||
2278 | { | ||
2279 | struct ceph_osd_client *osdc = lreq->osdc; | ||
2280 | struct ceph_osd_request *req = lreq->ping_req; | ||
2281 | struct ceph_osd_req_op *op = &req->r_ops[0]; | ||
1514 | 2282 | ||
1515 | ceph_con_open(&req->r_osd->o_con, | 2283 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { |
1516 | CEPH_ENTITY_TYPE_OSD, o, | 2284 | dout("%s PAUSERD\n", __func__); |
1517 | &osdc->osdmap->osd_addr[o]); | 2285 | return; |
1518 | } | 2286 | } |
1519 | 2287 | ||
1520 | __enqueue_request(req); | 2288 | lreq->ping_sent = jiffies; |
1521 | err = 1; /* osd or pg changed */ | 2289 | dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n", |
2290 | __func__, lreq, lreq->linger_id, lreq->ping_sent, | ||
2291 | lreq->register_gen); | ||
1522 | 2292 | ||
1523 | out: | 2293 | if (req->r_osd) |
1524 | return err; | 2294 | cancel_linger_request(req); |
2295 | |||
2296 | request_reinit(req); | ||
2297 | target_copy(&req->r_t, &lreq->t); | ||
2298 | |||
2299 | WARN_ON(op->op != CEPH_OSD_OP_WATCH || | ||
2300 | op->watch.cookie != lreq->linger_id || | ||
2301 | op->watch.op != CEPH_OSD_WATCH_OP_PING); | ||
2302 | op->watch.gen = lreq->register_gen; | ||
2303 | req->r_callback = linger_ping_cb; | ||
2304 | req->r_priv = linger_get(lreq); | ||
2305 | req->r_linger = true; | ||
2306 | |||
2307 | ceph_osdc_get_request(req); | ||
2308 | account_request(req); | ||
2309 | req->r_tid = atomic64_inc_return(&osdc->last_tid); | ||
2310 | link_request(lreq->osd, req); | ||
2311 | send_request(req); | ||
1525 | } | 2312 | } |
1526 | 2313 | ||
1527 | /* | 2314 | static void linger_submit(struct ceph_osd_linger_request *lreq) |
1528 | * caller should hold map_sem (for read) and request_mutex | ||
1529 | */ | ||
1530 | static void __send_request(struct ceph_osd_client *osdc, | ||
1531 | struct ceph_osd_request *req) | ||
1532 | { | 2315 | { |
1533 | void *p; | 2316 | struct ceph_osd_client *osdc = lreq->osdc; |
2317 | struct ceph_osd *osd; | ||
1534 | 2318 | ||
1535 | dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", | 2319 | calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); |
1536 | req, req->r_tid, req->r_osd->o_osd, req->r_flags, | 2320 | osd = lookup_create_osd(osdc, lreq->t.osd, true); |
1537 | (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); | 2321 | link_linger(osd, lreq); |
1538 | 2322 | ||
1539 | /* fill in message content that changes each time we send it */ | 2323 | send_linger(lreq); |
1540 | put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); | 2324 | } |
1541 | put_unaligned_le32(req->r_flags, req->r_request_flags); | ||
1542 | put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); | ||
1543 | p = req->r_request_pgid; | ||
1544 | ceph_encode_64(&p, req->r_pgid.pool); | ||
1545 | ceph_encode_32(&p, req->r_pgid.seed); | ||
1546 | put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ | ||
1547 | memcpy(req->r_request_reassert_version, &req->r_reassert_version, | ||
1548 | sizeof(req->r_reassert_version)); | ||
1549 | 2325 | ||
1550 | req->r_stamp = jiffies; | 2326 | static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) |
1551 | list_move_tail(&req->r_req_lru_item, &osdc->req_lru); | 2327 | { |
2328 | struct ceph_osd_client *osdc = lreq->osdc; | ||
2329 | struct ceph_osd_linger_request *lookup_lreq; | ||
1552 | 2330 | ||
1553 | ceph_msg_get(req->r_request); /* send consumes a ref */ | 2331 | verify_osdc_wrlocked(osdc); |
1554 | 2332 | ||
1555 | req->r_sent = req->r_osd->o_incarnation; | 2333 | lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, |
2334 | lreq->linger_id); | ||
2335 | if (!lookup_lreq) | ||
2336 | return; | ||
1556 | 2337 | ||
1557 | ceph_con_send(&req->r_osd->o_con, req->r_request); | 2338 | WARN_ON(lookup_lreq != lreq); |
2339 | erase_linger_mc(&osdc->linger_map_checks, lreq); | ||
2340 | linger_put(lreq); | ||
1558 | } | 2341 | } |
1559 | 2342 | ||
1560 | /* | 2343 | /* |
1561 | * Send any requests in the queue (req_unsent). | 2344 | * @lreq has to be both registered and linked. |
1562 | */ | 2345 | */ |
1563 | static void __send_queued(struct ceph_osd_client *osdc) | 2346 | static void __linger_cancel(struct ceph_osd_linger_request *lreq) |
2347 | { | ||
2348 | if (lreq->is_watch && lreq->ping_req->r_osd) | ||
2349 | cancel_linger_request(lreq->ping_req); | ||
2350 | if (lreq->reg_req->r_osd) | ||
2351 | cancel_linger_request(lreq->reg_req); | ||
2352 | cancel_linger_map_check(lreq); | ||
2353 | unlink_linger(lreq->osd, lreq); | ||
2354 | linger_unregister(lreq); | ||
2355 | } | ||
2356 | |||
2357 | static void linger_cancel(struct ceph_osd_linger_request *lreq) | ||
1564 | { | 2358 | { |
1565 | struct ceph_osd_request *req, *tmp; | 2359 | struct ceph_osd_client *osdc = lreq->osdc; |
1566 | 2360 | ||
1567 | dout("__send_queued\n"); | 2361 | down_write(&osdc->lock); |
1568 | list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) | 2362 | if (__linger_registered(lreq)) |
1569 | __send_request(osdc, req); | 2363 | __linger_cancel(lreq); |
2364 | up_write(&osdc->lock); | ||
1570 | } | 2365 | } |
1571 | 2366 | ||
1572 | /* | 2367 | static void send_linger_map_check(struct ceph_osd_linger_request *lreq); |
1573 | * Caller should hold map_sem for read and request_mutex. | 2368 | |
1574 | */ | 2369 | static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq) |
1575 | static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, | 2370 | { |
1576 | struct ceph_osd_request *req, | 2371 | struct ceph_osd_client *osdc = lreq->osdc; |
1577 | bool nofail) | 2372 | struct ceph_osdmap *map = osdc->osdmap; |
1578 | { | 2373 | |
1579 | int rc; | 2374 | verify_osdc_wrlocked(osdc); |
1580 | 2375 | WARN_ON(!map->epoch); | |
1581 | __register_request(osdc, req); | 2376 | |
1582 | req->r_sent = 0; | 2377 | if (lreq->register_gen) { |
1583 | req->r_got_reply = 0; | 2378 | lreq->map_dne_bound = map->epoch; |
1584 | rc = __map_request(osdc, req, 0); | 2379 | dout("%s lreq %p linger_id %llu pool disappeared\n", __func__, |
1585 | if (rc < 0) { | 2380 | lreq, lreq->linger_id); |
1586 | if (nofail) { | 2381 | } else { |
1587 | dout("osdc_start_request failed map, " | 2382 | dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n", |
1588 | " will retry %lld\n", req->r_tid); | 2383 | __func__, lreq, lreq->linger_id, lreq->map_dne_bound, |
1589 | rc = 0; | 2384 | map->epoch); |
1590 | } else { | ||
1591 | __unregister_request(osdc, req); | ||
1592 | } | ||
1593 | return rc; | ||
1594 | } | 2385 | } |
1595 | 2386 | ||
1596 | if (req->r_osd == NULL) { | 2387 | if (lreq->map_dne_bound) { |
1597 | dout("send_request %p no up osds in pg\n", req); | 2388 | if (map->epoch >= lreq->map_dne_bound) { |
1598 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 2389 | /* we had a new enough map */ |
2390 | pr_info("linger_id %llu pool does not exist\n", | ||
2391 | lreq->linger_id); | ||
2392 | linger_reg_commit_complete(lreq, -ENOENT); | ||
2393 | __linger_cancel(lreq); | ||
2394 | } | ||
1599 | } else { | 2395 | } else { |
1600 | __send_queued(osdc); | 2396 | send_linger_map_check(lreq); |
1601 | } | 2397 | } |
2398 | } | ||
1602 | 2399 | ||
1603 | return 0; | 2400 | static void linger_map_check_cb(struct ceph_mon_generic_request *greq) |
2401 | { | ||
2402 | struct ceph_osd_client *osdc = &greq->monc->client->osdc; | ||
2403 | struct ceph_osd_linger_request *lreq; | ||
2404 | u64 linger_id = greq->private_data; | ||
2405 | |||
2406 | WARN_ON(greq->result || !greq->u.newest); | ||
2407 | |||
2408 | down_write(&osdc->lock); | ||
2409 | lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id); | ||
2410 | if (!lreq) { | ||
2411 | dout("%s linger_id %llu dne\n", __func__, linger_id); | ||
2412 | goto out_unlock; | ||
2413 | } | ||
2414 | |||
2415 | dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n", | ||
2416 | __func__, lreq, lreq->linger_id, lreq->map_dne_bound, | ||
2417 | greq->u.newest); | ||
2418 | if (!lreq->map_dne_bound) | ||
2419 | lreq->map_dne_bound = greq->u.newest; | ||
2420 | erase_linger_mc(&osdc->linger_map_checks, lreq); | ||
2421 | check_linger_pool_dne(lreq); | ||
2422 | |||
2423 | linger_put(lreq); | ||
2424 | out_unlock: | ||
2425 | up_write(&osdc->lock); | ||
2426 | } | ||
2427 | |||
2428 | static void send_linger_map_check(struct ceph_osd_linger_request *lreq) | ||
2429 | { | ||
2430 | struct ceph_osd_client *osdc = lreq->osdc; | ||
2431 | struct ceph_osd_linger_request *lookup_lreq; | ||
2432 | int ret; | ||
2433 | |||
2434 | verify_osdc_wrlocked(osdc); | ||
2435 | |||
2436 | lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, | ||
2437 | lreq->linger_id); | ||
2438 | if (lookup_lreq) { | ||
2439 | WARN_ON(lookup_lreq != lreq); | ||
2440 | return; | ||
2441 | } | ||
2442 | |||
2443 | linger_get(lreq); | ||
2444 | insert_linger_mc(&osdc->linger_map_checks, lreq); | ||
2445 | ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", | ||
2446 | linger_map_check_cb, lreq->linger_id); | ||
2447 | WARN_ON(ret); | ||
2448 | } | ||
2449 | |||
2450 | static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) | ||
2451 | { | ||
2452 | int ret; | ||
2453 | |||
2454 | dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); | ||
2455 | ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); | ||
2456 | return ret ?: lreq->reg_commit_error; | ||
2457 | } | ||
2458 | |||
2459 | static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) | ||
2460 | { | ||
2461 | int ret; | ||
2462 | |||
2463 | dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); | ||
2464 | ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); | ||
2465 | return ret ?: lreq->notify_finish_error; | ||
1604 | } | 2466 | } |
1605 | 2467 | ||
1606 | /* | 2468 | /* |
1607 | * Timeout callback, called every N seconds when 1 or more osd | 2469 | * Timeout callback, called every N seconds. When 1 or more OSD |
1608 | * requests has been active for more than N seconds. When this | 2470 | * requests has been active for more than N seconds, we send a keepalive |
1609 | * happens, we ping all OSDs with requests who have timed out to | 2471 | * (tag + timestamp) to its OSD to ensure any communications channel |
1610 | * ensure any communications channel reset is detected. Reset the | 2472 | * reset is detected. |
1611 | * request timeouts another N seconds in the future as we go. | ||
1612 | * Reschedule the timeout event another N seconds in future (unless | ||
1613 | * there are no open requests). | ||
1614 | */ | 2473 | */ |
1615 | static void handle_timeout(struct work_struct *work) | 2474 | static void handle_timeout(struct work_struct *work) |
1616 | { | 2475 | { |
1617 | struct ceph_osd_client *osdc = | 2476 | struct ceph_osd_client *osdc = |
1618 | container_of(work, struct ceph_osd_client, timeout_work.work); | 2477 | container_of(work, struct ceph_osd_client, timeout_work.work); |
1619 | struct ceph_options *opts = osdc->client->options; | 2478 | struct ceph_options *opts = osdc->client->options; |
1620 | struct ceph_osd_request *req; | 2479 | unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; |
1621 | struct ceph_osd *osd; | 2480 | LIST_HEAD(slow_osds); |
1622 | struct list_head slow_osds; | 2481 | struct rb_node *n, *p; |
1623 | dout("timeout\n"); | ||
1624 | down_read(&osdc->map_sem); | ||
1625 | |||
1626 | ceph_monc_request_next_osdmap(&osdc->client->monc); | ||
1627 | 2482 | ||
1628 | mutex_lock(&osdc->request_mutex); | 2483 | dout("%s osdc %p\n", __func__, osdc); |
2484 | down_write(&osdc->lock); | ||
1629 | 2485 | ||
1630 | /* | 2486 | /* |
1631 | * ping osds that are a bit slow. this ensures that if there | 2487 | * ping osds that are a bit slow. this ensures that if there |
1632 | * is a break in the TCP connection we will notice, and reopen | 2488 | * is a break in the TCP connection we will notice, and reopen |
1633 | * a connection with that osd (from the fault callback). | 2489 | * a connection with that osd (from the fault callback). |
1634 | */ | 2490 | */ |
1635 | INIT_LIST_HEAD(&slow_osds); | 2491 | for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { |
1636 | list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { | 2492 | struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); |
1637 | if (time_before(jiffies, | 2493 | bool found = false; |
1638 | req->r_stamp + opts->osd_keepalive_timeout)) | 2494 | |
1639 | break; | 2495 | for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { |
2496 | struct ceph_osd_request *req = | ||
2497 | rb_entry(p, struct ceph_osd_request, r_node); | ||
2498 | |||
2499 | if (time_before(req->r_stamp, cutoff)) { | ||
2500 | dout(" req %p tid %llu on osd%d is laggy\n", | ||
2501 | req, req->r_tid, osd->o_osd); | ||
2502 | found = true; | ||
2503 | } | ||
2504 | } | ||
2505 | for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { | ||
2506 | struct ceph_osd_linger_request *lreq = | ||
2507 | rb_entry(p, struct ceph_osd_linger_request, node); | ||
2508 | |||
2509 | dout(" lreq %p linger_id %llu is served by osd%d\n", | ||
2510 | lreq, lreq->linger_id, osd->o_osd); | ||
2511 | found = true; | ||
2512 | |||
2513 | mutex_lock(&lreq->lock); | ||
2514 | if (lreq->is_watch && lreq->committed && !lreq->last_error) | ||
2515 | send_linger_ping(lreq); | ||
2516 | mutex_unlock(&lreq->lock); | ||
2517 | } | ||
1640 | 2518 | ||
1641 | osd = req->r_osd; | 2519 | if (found) |
1642 | BUG_ON(!osd); | 2520 | list_move_tail(&osd->o_keepalive_item, &slow_osds); |
1643 | dout(" tid %llu is slow, will send keepalive on osd%d\n", | ||
1644 | req->r_tid, osd->o_osd); | ||
1645 | list_move_tail(&osd->o_keepalive_item, &slow_osds); | ||
1646 | } | 2521 | } |
2522 | |||
2523 | if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) | ||
2524 | maybe_request_map(osdc); | ||
2525 | |||
1647 | while (!list_empty(&slow_osds)) { | 2526 | while (!list_empty(&slow_osds)) { |
1648 | osd = list_entry(slow_osds.next, struct ceph_osd, | 2527 | struct ceph_osd *osd = list_first_entry(&slow_osds, |
1649 | o_keepalive_item); | 2528 | struct ceph_osd, |
2529 | o_keepalive_item); | ||
1650 | list_del_init(&osd->o_keepalive_item); | 2530 | list_del_init(&osd->o_keepalive_item); |
1651 | ceph_con_keepalive(&osd->o_con); | 2531 | ceph_con_keepalive(&osd->o_con); |
1652 | } | 2532 | } |
1653 | 2533 | ||
1654 | __schedule_osd_timeout(osdc); | 2534 | up_write(&osdc->lock); |
1655 | __send_queued(osdc); | 2535 | schedule_delayed_work(&osdc->timeout_work, |
1656 | mutex_unlock(&osdc->request_mutex); | 2536 | osdc->client->options->osd_keepalive_timeout); |
1657 | up_read(&osdc->map_sem); | ||
1658 | } | 2537 | } |
1659 | 2538 | ||
1660 | static void handle_osds_timeout(struct work_struct *work) | 2539 | static void handle_osds_timeout(struct work_struct *work) |
@@ -1663,12 +2542,20 @@ static void handle_osds_timeout(struct work_struct *work) | |||
1663 | container_of(work, struct ceph_osd_client, | 2542 | container_of(work, struct ceph_osd_client, |
1664 | osds_timeout_work.work); | 2543 | osds_timeout_work.work); |
1665 | unsigned long delay = osdc->client->options->osd_idle_ttl / 4; | 2544 | unsigned long delay = osdc->client->options->osd_idle_ttl / 4; |
2545 | struct ceph_osd *osd, *nosd; | ||
1666 | 2546 | ||
1667 | dout("osds timeout\n"); | 2547 | dout("%s osdc %p\n", __func__, osdc); |
1668 | down_read(&osdc->map_sem); | 2548 | down_write(&osdc->lock); |
1669 | remove_old_osds(osdc); | 2549 | list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { |
1670 | up_read(&osdc->map_sem); | 2550 | if (time_before(jiffies, osd->lru_ttl)) |
2551 | break; | ||
1671 | 2552 | ||
2553 | WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); | ||
2554 | WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); | ||
2555 | close_osd(osd); | ||
2556 | } | ||
2557 | |||
2558 | up_write(&osdc->lock); | ||
1672 | schedule_delayed_work(&osdc->osds_timeout_work, | 2559 | schedule_delayed_work(&osdc->osds_timeout_work, |
1673 | round_jiffies_relative(delay)); | 2560 | round_jiffies_relative(delay)); |
1674 | } | 2561 | } |
@@ -1776,107 +2663,76 @@ e_inval: | |||
1776 | goto out; | 2663 | goto out; |
1777 | } | 2664 | } |
1778 | 2665 | ||
1779 | static void complete_request(struct ceph_osd_request *req) | 2666 | struct MOSDOpReply { |
1780 | { | 2667 | struct ceph_pg pgid; |
1781 | complete_all(&req->r_safe_completion); /* fsync waiter */ | 2668 | u64 flags; |
1782 | } | 2669 | int result; |
2670 | u32 epoch; | ||
2671 | int num_ops; | ||
2672 | u32 outdata_len[CEPH_OSD_MAX_OPS]; | ||
2673 | s32 rval[CEPH_OSD_MAX_OPS]; | ||
2674 | int retry_attempt; | ||
2675 | struct ceph_eversion replay_version; | ||
2676 | u64 user_version; | ||
2677 | struct ceph_request_redirect redirect; | ||
2678 | }; | ||
1783 | 2679 | ||
1784 | /* | 2680 | static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) |
1785 | * handle osd op reply. either call the callback if it is specified, | ||
1786 | * or do the completion to wake up the waiting thread. | ||
1787 | */ | ||
1788 | static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) | ||
1789 | { | 2681 | { |
1790 | void *p, *end; | 2682 | void *p = msg->front.iov_base; |
1791 | struct ceph_osd_request *req; | 2683 | void *const end = p + msg->front.iov_len; |
1792 | struct ceph_request_redirect redir; | 2684 | u16 version = le16_to_cpu(msg->hdr.version); |
1793 | u64 tid; | 2685 | struct ceph_eversion bad_replay_version; |
1794 | int object_len; | ||
1795 | unsigned int numops; | ||
1796 | int payload_len, flags; | ||
1797 | s32 result; | ||
1798 | s32 retry_attempt; | ||
1799 | struct ceph_pg pg; | ||
1800 | int err; | ||
1801 | u32 reassert_epoch; | ||
1802 | u64 reassert_version; | ||
1803 | u32 osdmap_epoch; | ||
1804 | int already_completed; | ||
1805 | u32 bytes; | ||
1806 | u8 decode_redir; | 2686 | u8 decode_redir; |
1807 | unsigned int i; | 2687 | u32 len; |
1808 | 2688 | int ret; | |
1809 | tid = le64_to_cpu(msg->hdr.tid); | 2689 | int i; |
1810 | dout("handle_reply %p tid %llu\n", msg, tid); | ||
1811 | 2690 | ||
1812 | p = msg->front.iov_base; | 2691 | ceph_decode_32_safe(&p, end, len, e_inval); |
1813 | end = p + msg->front.iov_len; | 2692 | ceph_decode_need(&p, end, len, e_inval); |
2693 | p += len; /* skip oid */ | ||
1814 | 2694 | ||
1815 | ceph_decode_need(&p, end, 4, bad); | 2695 | ret = ceph_decode_pgid(&p, end, &m->pgid); |
1816 | object_len = ceph_decode_32(&p); | 2696 | if (ret) |
1817 | ceph_decode_need(&p, end, object_len, bad); | 2697 | return ret; |
1818 | p += object_len; | ||
1819 | 2698 | ||
1820 | err = ceph_decode_pgid(&p, end, &pg); | 2699 | ceph_decode_64_safe(&p, end, m->flags, e_inval); |
1821 | if (err) | 2700 | ceph_decode_32_safe(&p, end, m->result, e_inval); |
1822 | goto bad; | 2701 | ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval); |
2702 | memcpy(&bad_replay_version, p, sizeof(bad_replay_version)); | ||
2703 | p += sizeof(bad_replay_version); | ||
2704 | ceph_decode_32_safe(&p, end, m->epoch, e_inval); | ||
1823 | 2705 | ||
1824 | ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); | 2706 | ceph_decode_32_safe(&p, end, m->num_ops, e_inval); |
1825 | flags = ceph_decode_64(&p); | 2707 | if (m->num_ops > ARRAY_SIZE(m->outdata_len)) |
1826 | result = ceph_decode_32(&p); | 2708 | goto e_inval; |
1827 | reassert_epoch = ceph_decode_32(&p); | ||
1828 | reassert_version = ceph_decode_64(&p); | ||
1829 | osdmap_epoch = ceph_decode_32(&p); | ||
1830 | |||
1831 | /* lookup */ | ||
1832 | down_read(&osdc->map_sem); | ||
1833 | mutex_lock(&osdc->request_mutex); | ||
1834 | req = __lookup_request(osdc, tid); | ||
1835 | if (req == NULL) { | ||
1836 | dout("handle_reply tid %llu dne\n", tid); | ||
1837 | goto bad_mutex; | ||
1838 | } | ||
1839 | ceph_osdc_get_request(req); | ||
1840 | 2709 | ||
1841 | dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, | 2710 | ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op), |
1842 | req, result); | 2711 | e_inval); |
1843 | 2712 | for (i = 0; i < m->num_ops; i++) { | |
1844 | ceph_decode_need(&p, end, 4, bad_put); | ||
1845 | numops = ceph_decode_32(&p); | ||
1846 | if (numops > CEPH_OSD_MAX_OPS) | ||
1847 | goto bad_put; | ||
1848 | if (numops != req->r_num_ops) | ||
1849 | goto bad_put; | ||
1850 | payload_len = 0; | ||
1851 | ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put); | ||
1852 | for (i = 0; i < numops; i++) { | ||
1853 | struct ceph_osd_op *op = p; | 2713 | struct ceph_osd_op *op = p; |
1854 | int len; | ||
1855 | 2714 | ||
1856 | len = le32_to_cpu(op->payload_len); | 2715 | m->outdata_len[i] = le32_to_cpu(op->payload_len); |
1857 | req->r_ops[i].outdata_len = len; | ||
1858 | dout(" op %d has %d bytes\n", i, len); | ||
1859 | payload_len += len; | ||
1860 | p += sizeof(*op); | 2716 | p += sizeof(*op); |
1861 | } | 2717 | } |
1862 | bytes = le32_to_cpu(msg->hdr.data_len); | ||
1863 | if (payload_len != bytes) { | ||
1864 | pr_warn("sum of op payload lens %d != data_len %d\n", | ||
1865 | payload_len, bytes); | ||
1866 | goto bad_put; | ||
1867 | } | ||
1868 | 2718 | ||
1869 | ceph_decode_need(&p, end, 4 + numops * 4, bad_put); | 2719 | ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval); |
1870 | retry_attempt = ceph_decode_32(&p); | 2720 | for (i = 0; i < m->num_ops; i++) |
1871 | for (i = 0; i < numops; i++) | 2721 | ceph_decode_32_safe(&p, end, m->rval[i], e_inval); |
1872 | req->r_ops[i].rval = ceph_decode_32(&p); | ||
1873 | 2722 | ||
1874 | if (le16_to_cpu(msg->hdr.version) >= 6) { | 2723 | if (version >= 5) { |
1875 | p += 8 + 4; /* skip replay_version */ | 2724 | ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval); |
1876 | p += 8; /* skip user_version */ | 2725 | memcpy(&m->replay_version, p, sizeof(m->replay_version)); |
2726 | p += sizeof(m->replay_version); | ||
2727 | ceph_decode_64_safe(&p, end, m->user_version, e_inval); | ||
2728 | } else { | ||
2729 | m->replay_version = bad_replay_version; /* struct */ | ||
2730 | m->user_version = le64_to_cpu(m->replay_version.version); | ||
2731 | } | ||
1877 | 2732 | ||
1878 | if (le16_to_cpu(msg->hdr.version) >= 7) | 2733 | if (version >= 6) { |
1879 | ceph_decode_8_safe(&p, end, decode_redir, bad_put); | 2734 | if (version >= 7) |
2735 | ceph_decode_8_safe(&p, end, decode_redir, e_inval); | ||
1880 | else | 2736 | else |
1881 | decode_redir = 1; | 2737 | decode_redir = 1; |
1882 | } else { | 2738 | } else { |
@@ -1884,228 +2740,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1884 | } | 2740 | } |
1885 | 2741 | ||
1886 | if (decode_redir) { | 2742 | if (decode_redir) { |
1887 | err = ceph_redirect_decode(&p, end, &redir); | 2743 | ret = ceph_redirect_decode(&p, end, &m->redirect); |
1888 | if (err) | 2744 | if (ret) |
1889 | goto bad_put; | 2745 | return ret; |
1890 | } else { | 2746 | } else { |
1891 | redir.oloc.pool = -1; | 2747 | ceph_oloc_init(&m->redirect.oloc); |
1892 | } | 2748 | } |
1893 | 2749 | ||
1894 | if (redir.oloc.pool != -1) { | 2750 | return 0; |
1895 | dout("redirect pool %lld\n", redir.oloc.pool); | ||
1896 | |||
1897 | __unregister_request(osdc, req); | ||
1898 | |||
1899 | req->r_target_oloc = redir.oloc; /* struct */ | ||
1900 | 2751 | ||
1901 | /* | 2752 | e_inval: |
1902 | * Start redirect requests with nofail=true. If | 2753 | return -EINVAL; |
1903 | * mapping fails, request will end up on the notarget | 2754 | } |
1904 | * list, waiting for the new osdmap (which can take | ||
1905 | * a while), even though the original request mapped | ||
1906 | * successfully. In the future we might want to follow | ||
1907 | * original request's nofail setting here. | ||
1908 | */ | ||
1909 | err = __ceph_osdc_start_request(osdc, req, true); | ||
1910 | BUG_ON(err); | ||
1911 | 2755 | ||
1912 | goto out_unlock; | 2756 | /* |
1913 | } | 2757 | * We are done with @req if |
2758 | * - @m is a safe reply, or | ||
2759 | * - @m is an unsafe reply and we didn't want a safe one | ||
2760 | */ | ||
2761 | static bool done_request(const struct ceph_osd_request *req, | ||
2762 | const struct MOSDOpReply *m) | ||
2763 | { | ||
2764 | return (m->result < 0 || | ||
2765 | (m->flags & CEPH_OSD_FLAG_ONDISK) || | ||
2766 | !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); | ||
2767 | } | ||
1914 | 2768 | ||
1915 | already_completed = req->r_got_reply; | 2769 | /* |
1916 | if (!req->r_got_reply) { | 2770 | * handle osd op reply. either call the callback if it is specified, |
1917 | req->r_result = result; | 2771 | * or do the completion to wake up the waiting thread. |
1918 | dout("handle_reply result %d bytes %d\n", req->r_result, | 2772 | * |
1919 | bytes); | 2773 | * ->r_unsafe_callback is set? yes no |
1920 | if (req->r_result == 0) | 2774 | * |
1921 | req->r_result = bytes; | 2775 | * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, |
2776 | * any or needed/got safe) r_safe_completion r_safe_completion | ||
2777 | * | ||
2778 | * first reply is unsafe r_unsafe_cb(true) (nothing) | ||
2779 | * | ||
2780 | * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, | ||
2781 | * r_safe_completion r_safe_completion | ||
2782 | */ | ||
2783 | static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) | ||
2784 | { | ||
2785 | struct ceph_osd_client *osdc = osd->o_osdc; | ||
2786 | struct ceph_osd_request *req; | ||
2787 | struct MOSDOpReply m; | ||
2788 | u64 tid = le64_to_cpu(msg->hdr.tid); | ||
2789 | u32 data_len = 0; | ||
2790 | bool already_acked; | ||
2791 | int ret; | ||
2792 | int i; | ||
1922 | 2793 | ||
1923 | /* in case this is a write and we need to replay, */ | 2794 | dout("%s msg %p tid %llu\n", __func__, msg, tid); |
1924 | req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); | ||
1925 | req->r_reassert_version.version = cpu_to_le64(reassert_version); | ||
1926 | 2795 | ||
1927 | req->r_got_reply = 1; | 2796 | down_read(&osdc->lock); |
1928 | } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { | 2797 | if (!osd_registered(osd)) { |
1929 | dout("handle_reply tid %llu dup ack\n", tid); | 2798 | dout("%s osd%d unknown\n", __func__, osd->o_osd); |
1930 | goto out_unlock; | 2799 | goto out_unlock_osdc; |
1931 | } | 2800 | } |
2801 | WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); | ||
1932 | 2802 | ||
1933 | dout("handle_reply tid %llu flags %d\n", tid, flags); | 2803 | mutex_lock(&osd->lock); |
2804 | req = lookup_request(&osd->o_requests, tid); | ||
2805 | if (!req) { | ||
2806 | dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid); | ||
2807 | goto out_unlock_session; | ||
2808 | } | ||
1934 | 2809 | ||
1935 | if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) | 2810 | ret = decode_MOSDOpReply(msg, &m); |
1936 | __register_linger_request(osdc, req); | 2811 | if (ret) { |
2812 | pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", | ||
2813 | req->r_tid, ret); | ||
2814 | ceph_msg_dump(msg); | ||
2815 | goto fail_request; | ||
2816 | } | ||
2817 | dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n", | ||
2818 | __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed, | ||
2819 | m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch), | ||
2820 | le64_to_cpu(m.replay_version.version), m.user_version); | ||
2821 | |||
2822 | if (m.retry_attempt >= 0) { | ||
2823 | if (m.retry_attempt != req->r_attempts - 1) { | ||
2824 | dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", | ||
2825 | req, req->r_tid, m.retry_attempt, | ||
2826 | req->r_attempts - 1); | ||
2827 | goto out_unlock_session; | ||
2828 | } | ||
2829 | } else { | ||
2830 | WARN_ON(1); /* MOSDOpReply v4 is assumed */ | ||
2831 | } | ||
1937 | 2832 | ||
1938 | /* either this is a read, or we got the safe response */ | 2833 | if (!ceph_oloc_empty(&m.redirect.oloc)) { |
1939 | if (result < 0 || | 2834 | dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, |
1940 | (flags & CEPH_OSD_FLAG_ONDISK) || | 2835 | m.redirect.oloc.pool); |
1941 | ((flags & CEPH_OSD_FLAG_WRITE) == 0)) | 2836 | unlink_request(osd, req); |
1942 | __unregister_request(osdc, req); | 2837 | mutex_unlock(&osd->lock); |
2838 | |||
2839 | ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); | ||
2840 | req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; | ||
2841 | req->r_tid = 0; | ||
2842 | __submit_request(req, false); | ||
2843 | goto out_unlock_osdc; | ||
2844 | } | ||
1943 | 2845 | ||
1944 | mutex_unlock(&osdc->request_mutex); | 2846 | if (m.num_ops != req->r_num_ops) { |
1945 | up_read(&osdc->map_sem); | 2847 | pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, |
2848 | req->r_num_ops, req->r_tid); | ||
2849 | goto fail_request; | ||
2850 | } | ||
2851 | for (i = 0; i < req->r_num_ops; i++) { | ||
2852 | dout(" req %p tid %llu op %d rval %d len %u\n", req, | ||
2853 | req->r_tid, i, m.rval[i], m.outdata_len[i]); | ||
2854 | req->r_ops[i].rval = m.rval[i]; | ||
2855 | req->r_ops[i].outdata_len = m.outdata_len[i]; | ||
2856 | data_len += m.outdata_len[i]; | ||
2857 | } | ||
2858 | if (data_len != le32_to_cpu(msg->hdr.data_len)) { | ||
2859 | pr_err("sum of lens %u != %u for tid %llu\n", data_len, | ||
2860 | le32_to_cpu(msg->hdr.data_len), req->r_tid); | ||
2861 | goto fail_request; | ||
2862 | } | ||
2863 | dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, | ||
2864 | req, req->r_tid, req->r_got_reply, m.result, data_len); | ||
2865 | |||
2866 | already_acked = req->r_got_reply; | ||
2867 | if (!already_acked) { | ||
2868 | req->r_result = m.result ?: data_len; | ||
2869 | req->r_replay_version = m.replay_version; /* struct */ | ||
2870 | req->r_got_reply = true; | ||
2871 | } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { | ||
2872 | dout("req %p tid %llu dup ack\n", req, req->r_tid); | ||
2873 | goto out_unlock_session; | ||
2874 | } | ||
1946 | 2875 | ||
1947 | if (!already_completed) { | 2876 | if (done_request(req, &m)) { |
1948 | if (req->r_unsafe_callback && | 2877 | __finish_request(req); |
1949 | result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) | 2878 | if (req->r_linger) { |
1950 | req->r_unsafe_callback(req, true); | 2879 | WARN_ON(req->r_unsafe_callback); |
1951 | if (req->r_callback) | 2880 | dout("req %p tid %llu cb (locked)\n", req, req->r_tid); |
1952 | req->r_callback(req, msg); | 2881 | __complete_request(req); |
1953 | else | 2882 | } |
1954 | complete_all(&req->r_completion); | ||
1955 | } | 2883 | } |
1956 | 2884 | ||
1957 | if (flags & CEPH_OSD_FLAG_ONDISK) { | 2885 | mutex_unlock(&osd->lock); |
1958 | if (req->r_unsafe_callback && already_completed) | 2886 | up_read(&osdc->lock); |
2887 | |||
2888 | if (done_request(req, &m)) { | ||
2889 | if (already_acked && req->r_unsafe_callback) { | ||
2890 | dout("req %p tid %llu safe-cb\n", req, req->r_tid); | ||
1959 | req->r_unsafe_callback(req, false); | 2891 | req->r_unsafe_callback(req, false); |
1960 | complete_request(req); | 2892 | } else if (!req->r_linger) { |
2893 | dout("req %p tid %llu cb\n", req, req->r_tid); | ||
2894 | __complete_request(req); | ||
2895 | } | ||
2896 | } else { | ||
2897 | if (req->r_unsafe_callback) { | ||
2898 | dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); | ||
2899 | req->r_unsafe_callback(req, true); | ||
2900 | } else { | ||
2901 | WARN_ON(1); | ||
2902 | } | ||
1961 | } | 2903 | } |
2904 | if (m.flags & CEPH_OSD_FLAG_ONDISK) | ||
2905 | complete_all(&req->r_safe_completion); | ||
1962 | 2906 | ||
1963 | out: | ||
1964 | dout("req=%p req->r_linger=%d\n", req, req->r_linger); | ||
1965 | ceph_osdc_put_request(req); | 2907 | ceph_osdc_put_request(req); |
1966 | return; | 2908 | return; |
1967 | out_unlock: | ||
1968 | mutex_unlock(&osdc->request_mutex); | ||
1969 | up_read(&osdc->map_sem); | ||
1970 | goto out; | ||
1971 | 2909 | ||
1972 | bad_put: | 2910 | fail_request: |
1973 | req->r_result = -EIO; | 2911 | complete_request(req, -EIO); |
1974 | __unregister_request(osdc, req); | 2912 | out_unlock_session: |
1975 | if (req->r_callback) | 2913 | mutex_unlock(&osd->lock); |
1976 | req->r_callback(req, msg); | 2914 | out_unlock_osdc: |
1977 | else | 2915 | up_read(&osdc->lock); |
1978 | complete_all(&req->r_completion); | ||
1979 | complete_request(req); | ||
1980 | ceph_osdc_put_request(req); | ||
1981 | bad_mutex: | ||
1982 | mutex_unlock(&osdc->request_mutex); | ||
1983 | up_read(&osdc->map_sem); | ||
1984 | bad: | ||
1985 | pr_err("corrupt osd_op_reply got %d %d\n", | ||
1986 | (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); | ||
1987 | ceph_msg_dump(msg); | ||
1988 | } | 2916 | } |
1989 | 2917 | ||
1990 | static void reset_changed_osds(struct ceph_osd_client *osdc) | 2918 | static void set_pool_was_full(struct ceph_osd_client *osdc) |
1991 | { | 2919 | { |
1992 | struct rb_node *p, *n; | 2920 | struct rb_node *n; |
1993 | 2921 | ||
1994 | dout("%s %p\n", __func__, osdc); | 2922 | for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { |
1995 | for (p = rb_first(&osdc->osds); p; p = n) { | 2923 | struct ceph_pg_pool_info *pi = |
1996 | struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); | 2924 | rb_entry(n, struct ceph_pg_pool_info, node); |
1997 | 2925 | ||
1998 | n = rb_next(p); | 2926 | pi->was_full = __pool_full(pi); |
1999 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || | ||
2000 | memcmp(&osd->o_con.peer_addr, | ||
2001 | ceph_osd_addr(osdc->osdmap, | ||
2002 | osd->o_osd), | ||
2003 | sizeof(struct ceph_entity_addr)) != 0) | ||
2004 | __reset_osd(osdc, osd); | ||
2005 | } | 2927 | } |
2006 | } | 2928 | } |
2007 | 2929 | ||
2930 | static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) | ||
2931 | { | ||
2932 | struct ceph_pg_pool_info *pi; | ||
2933 | |||
2934 | pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); | ||
2935 | if (!pi) | ||
2936 | return false; | ||
2937 | |||
2938 | return pi->was_full && !__pool_full(pi); | ||
2939 | } | ||
2940 | |||
2941 | static enum calc_target_result | ||
2942 | recalc_linger_target(struct ceph_osd_linger_request *lreq) | ||
2943 | { | ||
2944 | struct ceph_osd_client *osdc = lreq->osdc; | ||
2945 | enum calc_target_result ct_res; | ||
2946 | |||
2947 | ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); | ||
2948 | if (ct_res == CALC_TARGET_NEED_RESEND) { | ||
2949 | struct ceph_osd *osd; | ||
2950 | |||
2951 | osd = lookup_create_osd(osdc, lreq->t.osd, true); | ||
2952 | if (osd != lreq->osd) { | ||
2953 | unlink_linger(lreq->osd, lreq); | ||
2954 | link_linger(osd, lreq); | ||
2955 | } | ||
2956 | } | ||
2957 | |||
2958 | return ct_res; | ||
2959 | } | ||
2960 | |||
2008 | /* | 2961 | /* |
2009 | * Requeue requests whose mapping to an OSD has changed. If requests map to | 2962 | * Requeue requests whose mapping to an OSD has changed. |
2010 | * no osd, request a new map. | ||
2011 | * | ||
2012 | * Caller should hold map_sem for read. | ||
2013 | */ | 2963 | */ |
2014 | static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, | 2964 | static void scan_requests(struct ceph_osd *osd, |
2015 | bool force_resend_writes) | 2965 | bool force_resend, |
2966 | bool cleared_full, | ||
2967 | bool check_pool_cleared_full, | ||
2968 | struct rb_root *need_resend, | ||
2969 | struct list_head *need_resend_linger) | ||
2016 | { | 2970 | { |
2017 | struct ceph_osd_request *req, *nreq; | 2971 | struct ceph_osd_client *osdc = osd->o_osdc; |
2018 | struct rb_node *p; | 2972 | struct rb_node *n; |
2019 | int needmap = 0; | 2973 | bool force_resend_writes; |
2020 | int err; | 2974 | |
2021 | bool force_resend_req; | 2975 | for (n = rb_first(&osd->o_linger_requests); n; ) { |
2976 | struct ceph_osd_linger_request *lreq = | ||
2977 | rb_entry(n, struct ceph_osd_linger_request, node); | ||
2978 | enum calc_target_result ct_res; | ||
2979 | |||
2980 | n = rb_next(n); /* recalc_linger_target() */ | ||
2981 | |||
2982 | dout("%s lreq %p linger_id %llu\n", __func__, lreq, | ||
2983 | lreq->linger_id); | ||
2984 | ct_res = recalc_linger_target(lreq); | ||
2985 | switch (ct_res) { | ||
2986 | case CALC_TARGET_NO_ACTION: | ||
2987 | force_resend_writes = cleared_full || | ||
2988 | (check_pool_cleared_full && | ||
2989 | pool_cleared_full(osdc, lreq->t.base_oloc.pool)); | ||
2990 | if (!force_resend && !force_resend_writes) | ||
2991 | break; | ||
2992 | |||
2993 | /* fall through */ | ||
2994 | case CALC_TARGET_NEED_RESEND: | ||
2995 | cancel_linger_map_check(lreq); | ||
2996 | /* | ||
2997 | * scan_requests() for the previous epoch(s) | ||
2998 | * may have already added it to the list, since | ||
2999 | * it's not unlinked here. | ||
3000 | */ | ||
3001 | if (list_empty(&lreq->scan_item)) | ||
3002 | list_add_tail(&lreq->scan_item, need_resend_linger); | ||
3003 | break; | ||
3004 | case CALC_TARGET_POOL_DNE: | ||
3005 | check_linger_pool_dne(lreq); | ||
3006 | break; | ||
3007 | } | ||
3008 | } | ||
2022 | 3009 | ||
2023 | dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", | 3010 | for (n = rb_first(&osd->o_requests); n; ) { |
2024 | force_resend_writes ? " (force resend writes)" : ""); | 3011 | struct ceph_osd_request *req = |
2025 | mutex_lock(&osdc->request_mutex); | 3012 | rb_entry(n, struct ceph_osd_request, r_node); |
2026 | for (p = rb_first(&osdc->requests); p; ) { | 3013 | enum calc_target_result ct_res; |
2027 | req = rb_entry(p, struct ceph_osd_request, r_node); | 3014 | |
2028 | p = rb_next(p); | 3015 | n = rb_next(n); /* unlink_request(), check_pool_dne() */ |
3016 | |||
3017 | dout("%s req %p tid %llu\n", __func__, req, req->r_tid); | ||
3018 | ct_res = calc_target(osdc, &req->r_t, | ||
3019 | &req->r_last_force_resend, false); | ||
3020 | switch (ct_res) { | ||
3021 | case CALC_TARGET_NO_ACTION: | ||
3022 | force_resend_writes = cleared_full || | ||
3023 | (check_pool_cleared_full && | ||
3024 | pool_cleared_full(osdc, req->r_t.base_oloc.pool)); | ||
3025 | if (!force_resend && | ||
3026 | (!(req->r_flags & CEPH_OSD_FLAG_WRITE) || | ||
3027 | !force_resend_writes)) | ||
3028 | break; | ||
3029 | |||
3030 | /* fall through */ | ||
3031 | case CALC_TARGET_NEED_RESEND: | ||
3032 | cancel_map_check(req); | ||
3033 | unlink_request(osd, req); | ||
3034 | insert_request(need_resend, req); | ||
3035 | break; | ||
3036 | case CALC_TARGET_POOL_DNE: | ||
3037 | check_pool_dne(req); | ||
3038 | break; | ||
3039 | } | ||
3040 | } | ||
3041 | } | ||
2029 | 3042 | ||
3043 | static int handle_one_map(struct ceph_osd_client *osdc, | ||
3044 | void *p, void *end, bool incremental, | ||
3045 | struct rb_root *need_resend, | ||
3046 | struct list_head *need_resend_linger) | ||
3047 | { | ||
3048 | struct ceph_osdmap *newmap; | ||
3049 | struct rb_node *n; | ||
3050 | bool skipped_map = false; | ||
3051 | bool was_full; | ||
3052 | |||
3053 | was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); | ||
3054 | set_pool_was_full(osdc); | ||
3055 | |||
3056 | if (incremental) | ||
3057 | newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); | ||
3058 | else | ||
3059 | newmap = ceph_osdmap_decode(&p, end); | ||
3060 | if (IS_ERR(newmap)) | ||
3061 | return PTR_ERR(newmap); | ||
3062 | |||
3063 | if (newmap != osdc->osdmap) { | ||
2030 | /* | 3064 | /* |
2031 | * For linger requests that have not yet been | 3065 | * Preserve ->was_full before destroying the old map. |
2032 | * registered, move them to the linger list; they'll | 3066 | * For pools that weren't in the old map, ->was_full |
2033 | * be sent to the osd in the loop below. Unregister | 3067 | * should be false. |
2034 | * the request before re-registering it as a linger | ||
2035 | * request to ensure the __map_request() below | ||
2036 | * will decide it needs to be sent. | ||
2037 | */ | 3068 | */ |
2038 | if (req->r_linger && list_empty(&req->r_linger_item)) { | 3069 | for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) { |
2039 | dout("%p tid %llu restart on osd%d\n", | 3070 | struct ceph_pg_pool_info *pi = |
2040 | req, req->r_tid, | 3071 | rb_entry(n, struct ceph_pg_pool_info, node); |
2041 | req->r_osd ? req->r_osd->o_osd : -1); | 3072 | struct ceph_pg_pool_info *old_pi; |
2042 | ceph_osdc_get_request(req); | 3073 | |
2043 | __unregister_request(osdc, req); | 3074 | old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id); |
2044 | __register_linger_request(osdc, req); | 3075 | if (old_pi) |
2045 | ceph_osdc_put_request(req); | 3076 | pi->was_full = old_pi->was_full; |
2046 | continue; | 3077 | else |
3078 | WARN_ON(pi->was_full); | ||
2047 | } | 3079 | } |
2048 | 3080 | ||
2049 | force_resend_req = force_resend || | 3081 | if (osdc->osdmap->epoch && |
2050 | (force_resend_writes && | 3082 | osdc->osdmap->epoch + 1 < newmap->epoch) { |
2051 | req->r_flags & CEPH_OSD_FLAG_WRITE); | 3083 | WARN_ON(incremental); |
2052 | err = __map_request(osdc, req, force_resend_req); | 3084 | skipped_map = true; |
2053 | if (err < 0) | ||
2054 | continue; /* error */ | ||
2055 | if (req->r_osd == NULL) { | ||
2056 | dout("%p tid %llu maps to no osd\n", req, req->r_tid); | ||
2057 | needmap++; /* request a newer map */ | ||
2058 | } else if (err > 0) { | ||
2059 | if (!req->r_linger) { | ||
2060 | dout("%p tid %llu requeued on osd%d\n", req, | ||
2061 | req->r_tid, | ||
2062 | req->r_osd ? req->r_osd->o_osd : -1); | ||
2063 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | ||
2064 | } | ||
2065 | } | 3085 | } |
3086 | |||
3087 | ceph_osdmap_destroy(osdc->osdmap); | ||
3088 | osdc->osdmap = newmap; | ||
2066 | } | 3089 | } |
2067 | 3090 | ||
2068 | list_for_each_entry_safe(req, nreq, &osdc->req_linger, | 3091 | was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); |
2069 | r_linger_item) { | 3092 | scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, |
2070 | dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); | 3093 | need_resend, need_resend_linger); |
2071 | 3094 | ||
2072 | err = __map_request(osdc, req, | 3095 | for (n = rb_first(&osdc->osds); n; ) { |
2073 | force_resend || force_resend_writes); | 3096 | struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); |
2074 | dout("__map_request returned %d\n", err); | 3097 | |
2075 | if (err < 0) | 3098 | n = rb_next(n); /* close_osd() */ |
2076 | continue; /* hrm! */ | 3099 | |
2077 | if (req->r_osd == NULL || err > 0) { | 3100 | scan_requests(osd, skipped_map, was_full, true, need_resend, |
2078 | if (req->r_osd == NULL) { | 3101 | need_resend_linger); |
2079 | dout("lingering %p tid %llu maps to no osd\n", | 3102 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || |
2080 | req, req->r_tid); | 3103 | memcmp(&osd->o_con.peer_addr, |
2081 | /* | 3104 | ceph_osd_addr(osdc->osdmap, osd->o_osd), |
2082 | * A homeless lingering request makes | 3105 | sizeof(struct ceph_entity_addr))) |
2083 | * no sense, as it's job is to keep | 3106 | close_osd(osd); |
2084 | * a particular OSD connection open. | 3107 | } |
2085 | * Request a newer map and kick the | ||
2086 | * request, knowing that it won't be | ||
2087 | * resent until we actually get a map | ||
2088 | * that can tell us where to send it. | ||
2089 | */ | ||
2090 | needmap++; | ||
2091 | } | ||
2092 | 3108 | ||
2093 | dout("kicking lingering %p tid %llu osd%d\n", req, | 3109 | return 0; |
2094 | req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); | 3110 | } |
2095 | __register_request(osdc, req); | 3111 | |
2096 | __unregister_linger_request(osdc, req); | 3112 | static void kick_requests(struct ceph_osd_client *osdc, |
3113 | struct rb_root *need_resend, | ||
3114 | struct list_head *need_resend_linger) | ||
3115 | { | ||
3116 | struct ceph_osd_linger_request *lreq, *nlreq; | ||
3117 | struct rb_node *n; | ||
3118 | |||
3119 | for (n = rb_first(need_resend); n; ) { | ||
3120 | struct ceph_osd_request *req = | ||
3121 | rb_entry(n, struct ceph_osd_request, r_node); | ||
3122 | struct ceph_osd *osd; | ||
3123 | |||
3124 | n = rb_next(n); | ||
3125 | erase_request(need_resend, req); /* before link_request() */ | ||
3126 | |||
3127 | WARN_ON(req->r_osd); | ||
3128 | calc_target(osdc, &req->r_t, NULL, false); | ||
3129 | osd = lookup_create_osd(osdc, req->r_t.osd, true); | ||
3130 | link_request(osd, req); | ||
3131 | if (!req->r_linger) { | ||
3132 | if (!osd_homeless(osd) && !req->r_t.paused) | ||
3133 | send_request(req); | ||
3134 | } else { | ||
3135 | cancel_linger_request(req); | ||
2097 | } | 3136 | } |
2098 | } | 3137 | } |
2099 | reset_changed_osds(osdc); | ||
2100 | mutex_unlock(&osdc->request_mutex); | ||
2101 | 3138 | ||
2102 | if (needmap) { | 3139 | list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) { |
2103 | dout("%d requests for down osds, need new map\n", needmap); | 3140 | if (!osd_homeless(lreq->osd)) |
2104 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 3141 | send_linger(lreq); |
3142 | |||
3143 | list_del_init(&lreq->scan_item); | ||
2105 | } | 3144 | } |
2106 | } | 3145 | } |
2107 | 3146 | ||
2108 | |||
2109 | /* | 3147 | /* |
2110 | * Process updated osd map. | 3148 | * Process updated osd map. |
2111 | * | 3149 | * |
@@ -2115,27 +3153,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, | |||
2115 | */ | 3153 | */ |
2116 | void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | 3154 | void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) |
2117 | { | 3155 | { |
2118 | void *p, *end, *next; | 3156 | void *p = msg->front.iov_base; |
3157 | void *const end = p + msg->front.iov_len; | ||
2119 | u32 nr_maps, maplen; | 3158 | u32 nr_maps, maplen; |
2120 | u32 epoch; | 3159 | u32 epoch; |
2121 | struct ceph_osdmap *newmap = NULL, *oldmap; | ||
2122 | int err; | ||
2123 | struct ceph_fsid fsid; | 3160 | struct ceph_fsid fsid; |
2124 | bool was_full; | 3161 | struct rb_root need_resend = RB_ROOT; |
3162 | LIST_HEAD(need_resend_linger); | ||
3163 | bool handled_incremental = false; | ||
3164 | bool was_pauserd, was_pausewr; | ||
3165 | bool pauserd, pausewr; | ||
3166 | int err; | ||
2125 | 3167 | ||
2126 | dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); | 3168 | dout("%s have %u\n", __func__, osdc->osdmap->epoch); |
2127 | p = msg->front.iov_base; | 3169 | down_write(&osdc->lock); |
2128 | end = p + msg->front.iov_len; | ||
2129 | 3170 | ||
2130 | /* verify fsid */ | 3171 | /* verify fsid */ |
2131 | ceph_decode_need(&p, end, sizeof(fsid), bad); | 3172 | ceph_decode_need(&p, end, sizeof(fsid), bad); |
2132 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | 3173 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); |
2133 | if (ceph_check_fsid(osdc->client, &fsid) < 0) | 3174 | if (ceph_check_fsid(osdc->client, &fsid) < 0) |
2134 | return; | 3175 | goto bad; |
2135 | 3176 | ||
2136 | down_write(&osdc->map_sem); | 3177 | was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); |
2137 | 3178 | was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || | |
2138 | was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); | 3179 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || |
3180 | have_pool_full(osdc); | ||
2139 | 3181 | ||
2140 | /* incremental maps */ | 3182 | /* incremental maps */ |
2141 | ceph_decode_32_safe(&p, end, nr_maps, bad); | 3183 | ceph_decode_32_safe(&p, end, nr_maps, bad); |
@@ -2145,34 +3187,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
2145 | epoch = ceph_decode_32(&p); | 3187 | epoch = ceph_decode_32(&p); |
2146 | maplen = ceph_decode_32(&p); | 3188 | maplen = ceph_decode_32(&p); |
2147 | ceph_decode_need(&p, end, maplen, bad); | 3189 | ceph_decode_need(&p, end, maplen, bad); |
2148 | next = p + maplen; | 3190 | if (osdc->osdmap->epoch && |
2149 | if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { | 3191 | osdc->osdmap->epoch + 1 == epoch) { |
2150 | dout("applying incremental map %u len %d\n", | 3192 | dout("applying incremental map %u len %d\n", |
2151 | epoch, maplen); | 3193 | epoch, maplen); |
2152 | newmap = osdmap_apply_incremental(&p, next, | 3194 | err = handle_one_map(osdc, p, p + maplen, true, |
2153 | osdc->osdmap, | 3195 | &need_resend, &need_resend_linger); |
2154 | &osdc->client->msgr); | 3196 | if (err) |
2155 | if (IS_ERR(newmap)) { | ||
2156 | err = PTR_ERR(newmap); | ||
2157 | goto bad; | 3197 | goto bad; |
2158 | } | 3198 | handled_incremental = true; |
2159 | BUG_ON(!newmap); | ||
2160 | if (newmap != osdc->osdmap) { | ||
2161 | ceph_osdmap_destroy(osdc->osdmap); | ||
2162 | osdc->osdmap = newmap; | ||
2163 | } | ||
2164 | was_full = was_full || | ||
2165 | ceph_osdmap_flag(osdc->osdmap, | ||
2166 | CEPH_OSDMAP_FULL); | ||
2167 | kick_requests(osdc, 0, was_full); | ||
2168 | } else { | 3199 | } else { |
2169 | dout("ignoring incremental map %u len %d\n", | 3200 | dout("ignoring incremental map %u len %d\n", |
2170 | epoch, maplen); | 3201 | epoch, maplen); |
2171 | } | 3202 | } |
2172 | p = next; | 3203 | p += maplen; |
2173 | nr_maps--; | 3204 | nr_maps--; |
2174 | } | 3205 | } |
2175 | if (newmap) | 3206 | if (handled_incremental) |
2176 | goto done; | 3207 | goto done; |
2177 | 3208 | ||
2178 | /* full maps */ | 3209 | /* full maps */ |
@@ -2186,455 +3217,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
2186 | if (nr_maps > 1) { | 3217 | if (nr_maps > 1) { |
2187 | dout("skipping non-latest full map %u len %d\n", | 3218 | dout("skipping non-latest full map %u len %d\n", |
2188 | epoch, maplen); | 3219 | epoch, maplen); |
2189 | } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { | 3220 | } else if (osdc->osdmap->epoch >= epoch) { |
2190 | dout("skipping full map %u len %d, " | 3221 | dout("skipping full map %u len %d, " |
2191 | "older than our %u\n", epoch, maplen, | 3222 | "older than our %u\n", epoch, maplen, |
2192 | osdc->osdmap->epoch); | 3223 | osdc->osdmap->epoch); |
2193 | } else { | 3224 | } else { |
2194 | int skipped_map = 0; | ||
2195 | |||
2196 | dout("taking full map %u len %d\n", epoch, maplen); | 3225 | dout("taking full map %u len %d\n", epoch, maplen); |
2197 | newmap = ceph_osdmap_decode(&p, p+maplen); | 3226 | err = handle_one_map(osdc, p, p + maplen, false, |
2198 | if (IS_ERR(newmap)) { | 3227 | &need_resend, &need_resend_linger); |
2199 | err = PTR_ERR(newmap); | 3228 | if (err) |
2200 | goto bad; | 3229 | goto bad; |
2201 | } | ||
2202 | BUG_ON(!newmap); | ||
2203 | oldmap = osdc->osdmap; | ||
2204 | osdc->osdmap = newmap; | ||
2205 | if (oldmap) { | ||
2206 | if (oldmap->epoch + 1 < newmap->epoch) | ||
2207 | skipped_map = 1; | ||
2208 | ceph_osdmap_destroy(oldmap); | ||
2209 | } | ||
2210 | was_full = was_full || | ||
2211 | ceph_osdmap_flag(osdc->osdmap, | ||
2212 | CEPH_OSDMAP_FULL); | ||
2213 | kick_requests(osdc, skipped_map, was_full); | ||
2214 | } | 3230 | } |
2215 | p += maplen; | 3231 | p += maplen; |
2216 | nr_maps--; | 3232 | nr_maps--; |
2217 | } | 3233 | } |
2218 | 3234 | ||
2219 | if (!osdc->osdmap) | ||
2220 | goto bad; | ||
2221 | done: | 3235 | done: |
2222 | downgrade_write(&osdc->map_sem); | ||
2223 | ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, | ||
2224 | osdc->osdmap->epoch); | ||
2225 | |||
2226 | /* | 3236 | /* |
2227 | * subscribe to subsequent osdmap updates if full to ensure | 3237 | * subscribe to subsequent osdmap updates if full to ensure |
2228 | * we find out when we are no longer full and stop returning | 3238 | * we find out when we are no longer full and stop returning |
2229 | * ENOSPC. | 3239 | * ENOSPC. |
2230 | */ | 3240 | */ |
2231 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || | 3241 | pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); |
2232 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || | 3242 | pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || |
2233 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) | 3243 | ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || |
2234 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 3244 | have_pool_full(osdc); |
2235 | 3245 | if (was_pauserd || was_pausewr || pauserd || pausewr) | |
2236 | mutex_lock(&osdc->request_mutex); | 3246 | maybe_request_map(osdc); |
2237 | __send_queued(osdc); | 3247 | |
2238 | mutex_unlock(&osdc->request_mutex); | 3248 | kick_requests(osdc, &need_resend, &need_resend_linger); |
2239 | up_read(&osdc->map_sem); | 3249 | |
3250 | ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, | ||
3251 | osdc->osdmap->epoch); | ||
3252 | up_write(&osdc->lock); | ||
2240 | wake_up_all(&osdc->client->auth_wq); | 3253 | wake_up_all(&osdc->client->auth_wq); |
2241 | return; | 3254 | return; |
2242 | 3255 | ||
2243 | bad: | 3256 | bad: |
2244 | pr_err("osdc handle_map corrupt msg\n"); | 3257 | pr_err("osdc handle_map corrupt msg\n"); |
2245 | ceph_msg_dump(msg); | 3258 | ceph_msg_dump(msg); |
2246 | up_write(&osdc->map_sem); | 3259 | up_write(&osdc->lock); |
2247 | } | 3260 | } |
2248 | 3261 | ||
2249 | /* | 3262 | /* |
2250 | * watch/notify callback event infrastructure | 3263 | * Resubmit requests pending on the given osd. |
2251 | * | ||
2252 | * These callbacks are used both for watch and notify operations. | ||
2253 | */ | 3264 | */ |
2254 | static void __release_event(struct kref *kref) | 3265 | static void kick_osd_requests(struct ceph_osd *osd) |
2255 | { | 3266 | { |
2256 | struct ceph_osd_event *event = | 3267 | struct rb_node *n; |
2257 | container_of(kref, struct ceph_osd_event, kref); | ||
2258 | 3268 | ||
2259 | dout("__release_event %p\n", event); | 3269 | for (n = rb_first(&osd->o_requests); n; ) { |
2260 | kfree(event); | 3270 | struct ceph_osd_request *req = |
2261 | } | 3271 | rb_entry(n, struct ceph_osd_request, r_node); |
2262 | 3272 | ||
2263 | static void get_event(struct ceph_osd_event *event) | 3273 | n = rb_next(n); /* cancel_linger_request() */ |
2264 | { | ||
2265 | kref_get(&event->kref); | ||
2266 | } | ||
2267 | 3274 | ||
2268 | void ceph_osdc_put_event(struct ceph_osd_event *event) | 3275 | if (!req->r_linger) { |
2269 | { | 3276 | if (!req->r_t.paused) |
2270 | kref_put(&event->kref, __release_event); | 3277 | send_request(req); |
3278 | } else { | ||
3279 | cancel_linger_request(req); | ||
3280 | } | ||
3281 | } | ||
3282 | for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { | ||
3283 | struct ceph_osd_linger_request *lreq = | ||
3284 | rb_entry(n, struct ceph_osd_linger_request, node); | ||
3285 | |||
3286 | send_linger(lreq); | ||
3287 | } | ||
2271 | } | 3288 | } |
2272 | EXPORT_SYMBOL(ceph_osdc_put_event); | ||
2273 | 3289 | ||
2274 | static void __insert_event(struct ceph_osd_client *osdc, | 3290 | /* |
2275 | struct ceph_osd_event *new) | 3291 | * If the osd connection drops, we need to resubmit all requests. |
3292 | */ | ||
3293 | static void osd_fault(struct ceph_connection *con) | ||
2276 | { | 3294 | { |
2277 | struct rb_node **p = &osdc->event_tree.rb_node; | 3295 | struct ceph_osd *osd = con->private; |
2278 | struct rb_node *parent = NULL; | 3296 | struct ceph_osd_client *osdc = osd->o_osdc; |
2279 | struct ceph_osd_event *event = NULL; | ||
2280 | 3297 | ||
2281 | while (*p) { | 3298 | dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); |
2282 | parent = *p; | 3299 | |
2283 | event = rb_entry(parent, struct ceph_osd_event, node); | 3300 | down_write(&osdc->lock); |
2284 | if (new->cookie < event->cookie) | 3301 | if (!osd_registered(osd)) { |
2285 | p = &(*p)->rb_left; | 3302 | dout("%s osd%d unknown\n", __func__, osd->o_osd); |
2286 | else if (new->cookie > event->cookie) | 3303 | goto out_unlock; |
2287 | p = &(*p)->rb_right; | ||
2288 | else | ||
2289 | BUG(); | ||
2290 | } | 3304 | } |
2291 | 3305 | ||
2292 | rb_link_node(&new->node, parent, p); | 3306 | if (!reopen_osd(osd)) |
2293 | rb_insert_color(&new->node, &osdc->event_tree); | 3307 | kick_osd_requests(osd); |
3308 | maybe_request_map(osdc); | ||
3309 | |||
3310 | out_unlock: | ||
3311 | up_write(&osdc->lock); | ||
2294 | } | 3312 | } |
2295 | 3313 | ||
2296 | static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, | 3314 | /* |
2297 | u64 cookie) | 3315 | * Process osd watch notifications |
3316 | */ | ||
3317 | static void handle_watch_notify(struct ceph_osd_client *osdc, | ||
3318 | struct ceph_msg *msg) | ||
2298 | { | 3319 | { |
2299 | struct rb_node **p = &osdc->event_tree.rb_node; | 3320 | void *p = msg->front.iov_base; |
2300 | struct rb_node *parent = NULL; | 3321 | void *const end = p + msg->front.iov_len; |
2301 | struct ceph_osd_event *event = NULL; | 3322 | struct ceph_osd_linger_request *lreq; |
3323 | struct linger_work *lwork; | ||
3324 | u8 proto_ver, opcode; | ||
3325 | u64 cookie, notify_id; | ||
3326 | u64 notifier_id = 0; | ||
3327 | s32 return_code = 0; | ||
3328 | void *payload = NULL; | ||
3329 | u32 payload_len = 0; | ||
2302 | 3330 | ||
2303 | while (*p) { | 3331 | ceph_decode_8_safe(&p, end, proto_ver, bad); |
2304 | parent = *p; | 3332 | ceph_decode_8_safe(&p, end, opcode, bad); |
2305 | event = rb_entry(parent, struct ceph_osd_event, node); | 3333 | ceph_decode_64_safe(&p, end, cookie, bad); |
2306 | if (cookie < event->cookie) | 3334 | p += 8; /* skip ver */ |
2307 | p = &(*p)->rb_left; | 3335 | ceph_decode_64_safe(&p, end, notify_id, bad); |
2308 | else if (cookie > event->cookie) | 3336 | |
2309 | p = &(*p)->rb_right; | 3337 | if (proto_ver >= 1) { |
2310 | else | 3338 | ceph_decode_32_safe(&p, end, payload_len, bad); |
2311 | return event; | 3339 | ceph_decode_need(&p, end, payload_len, bad); |
3340 | payload = p; | ||
3341 | p += payload_len; | ||
2312 | } | 3342 | } |
2313 | return NULL; | ||
2314 | } | ||
2315 | 3343 | ||
2316 | static void __remove_event(struct ceph_osd_event *event) | 3344 | if (le16_to_cpu(msg->hdr.version) >= 2) |
2317 | { | 3345 | ceph_decode_32_safe(&p, end, return_code, bad); |
2318 | struct ceph_osd_client *osdc = event->osdc; | ||
2319 | 3346 | ||
2320 | if (!RB_EMPTY_NODE(&event->node)) { | 3347 | if (le16_to_cpu(msg->hdr.version) >= 3) |
2321 | dout("__remove_event removed %p\n", event); | 3348 | ceph_decode_64_safe(&p, end, notifier_id, bad); |
2322 | rb_erase(&event->node, &osdc->event_tree); | 3349 | |
2323 | ceph_osdc_put_event(event); | 3350 | down_read(&osdc->lock); |
3351 | lreq = lookup_linger_osdc(&osdc->linger_requests, cookie); | ||
3352 | if (!lreq) { | ||
3353 | dout("%s opcode %d cookie %llu dne\n", __func__, opcode, | ||
3354 | cookie); | ||
3355 | goto out_unlock_osdc; | ||
3356 | } | ||
3357 | |||
3358 | mutex_lock(&lreq->lock); | ||
3359 | dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__, | ||
3360 | opcode, cookie, lreq, lreq->is_watch); | ||
3361 | if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { | ||
3362 | if (!lreq->last_error) { | ||
3363 | lreq->last_error = -ENOTCONN; | ||
3364 | queue_watch_error(lreq); | ||
3365 | } | ||
3366 | } else if (!lreq->is_watch) { | ||
3367 | /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */ | ||
3368 | if (lreq->notify_id && lreq->notify_id != notify_id) { | ||
3369 | dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq, | ||
3370 | lreq->notify_id, notify_id); | ||
3371 | } else if (!completion_done(&lreq->notify_finish_wait)) { | ||
3372 | struct ceph_msg_data *data = | ||
3373 | list_first_entry_or_null(&msg->data, | ||
3374 | struct ceph_msg_data, | ||
3375 | links); | ||
3376 | |||
3377 | if (data) { | ||
3378 | if (lreq->preply_pages) { | ||
3379 | WARN_ON(data->type != | ||
3380 | CEPH_MSG_DATA_PAGES); | ||
3381 | *lreq->preply_pages = data->pages; | ||
3382 | *lreq->preply_len = data->length; | ||
3383 | } else { | ||
3384 | ceph_release_page_vector(data->pages, | ||
3385 | calc_pages_for(0, data->length)); | ||
3386 | } | ||
3387 | } | ||
3388 | lreq->notify_finish_error = return_code; | ||
3389 | complete_all(&lreq->notify_finish_wait); | ||
3390 | } | ||
2324 | } else { | 3391 | } else { |
2325 | dout("__remove_event didn't remove %p\n", event); | 3392 | /* CEPH_WATCH_EVENT_NOTIFY */ |
3393 | lwork = lwork_alloc(lreq, do_watch_notify); | ||
3394 | if (!lwork) { | ||
3395 | pr_err("failed to allocate notify-lwork\n"); | ||
3396 | goto out_unlock_lreq; | ||
3397 | } | ||
3398 | |||
3399 | lwork->notify.notify_id = notify_id; | ||
3400 | lwork->notify.notifier_id = notifier_id; | ||
3401 | lwork->notify.payload = payload; | ||
3402 | lwork->notify.payload_len = payload_len; | ||
3403 | lwork->notify.msg = ceph_msg_get(msg); | ||
3404 | lwork_queue(lwork); | ||
2326 | } | 3405 | } |
3406 | |||
3407 | out_unlock_lreq: | ||
3408 | mutex_unlock(&lreq->lock); | ||
3409 | out_unlock_osdc: | ||
3410 | up_read(&osdc->lock); | ||
3411 | return; | ||
3412 | |||
3413 | bad: | ||
3414 | pr_err("osdc handle_watch_notify corrupt msg\n"); | ||
2327 | } | 3415 | } |
2328 | 3416 | ||
2329 | int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 3417 | /* |
2330 | void (*event_cb)(u64, u64, u8, void *), | 3418 | * Register request, send initial attempt. |
2331 | void *data, struct ceph_osd_event **pevent) | 3419 | */ |
3420 | int ceph_osdc_start_request(struct ceph_osd_client *osdc, | ||
3421 | struct ceph_osd_request *req, | ||
3422 | bool nofail) | ||
2332 | { | 3423 | { |
2333 | struct ceph_osd_event *event; | 3424 | down_read(&osdc->lock); |
2334 | 3425 | submit_request(req, false); | |
2335 | event = kmalloc(sizeof(*event), GFP_NOIO); | 3426 | up_read(&osdc->lock); |
2336 | if (!event) | ||
2337 | return -ENOMEM; | ||
2338 | 3427 | ||
2339 | dout("create_event %p\n", event); | ||
2340 | event->cb = event_cb; | ||
2341 | event->one_shot = 0; | ||
2342 | event->data = data; | ||
2343 | event->osdc = osdc; | ||
2344 | INIT_LIST_HEAD(&event->osd_node); | ||
2345 | RB_CLEAR_NODE(&event->node); | ||
2346 | kref_init(&event->kref); /* one ref for us */ | ||
2347 | kref_get(&event->kref); /* one ref for the caller */ | ||
2348 | |||
2349 | spin_lock(&osdc->event_lock); | ||
2350 | event->cookie = ++osdc->event_count; | ||
2351 | __insert_event(osdc, event); | ||
2352 | spin_unlock(&osdc->event_lock); | ||
2353 | |||
2354 | *pevent = event; | ||
2355 | return 0; | 3428 | return 0; |
2356 | } | 3429 | } |
2357 | EXPORT_SYMBOL(ceph_osdc_create_event); | 3430 | EXPORT_SYMBOL(ceph_osdc_start_request); |
2358 | 3431 | ||
2359 | void ceph_osdc_cancel_event(struct ceph_osd_event *event) | 3432 | /* |
3433 | * Unregister a registered request. The request is not completed (i.e. | ||
3434 | * no callbacks or wakeups) - higher layers are supposed to know what | ||
3435 | * they are canceling. | ||
3436 | */ | ||
3437 | void ceph_osdc_cancel_request(struct ceph_osd_request *req) | ||
2360 | { | 3438 | { |
2361 | struct ceph_osd_client *osdc = event->osdc; | 3439 | struct ceph_osd_client *osdc = req->r_osdc; |
2362 | 3440 | ||
2363 | dout("cancel_event %p\n", event); | 3441 | down_write(&osdc->lock); |
2364 | spin_lock(&osdc->event_lock); | 3442 | if (req->r_osd) |
2365 | __remove_event(event); | 3443 | cancel_request(req); |
2366 | spin_unlock(&osdc->event_lock); | 3444 | up_write(&osdc->lock); |
2367 | ceph_osdc_put_event(event); /* caller's */ | ||
2368 | } | 3445 | } |
2369 | EXPORT_SYMBOL(ceph_osdc_cancel_event); | 3446 | EXPORT_SYMBOL(ceph_osdc_cancel_request); |
2370 | |||
2371 | 3447 | ||
2372 | static void do_event_work(struct work_struct *work) | 3448 | /* |
3449 | * @timeout: in jiffies, 0 means "wait forever" | ||
3450 | */ | ||
3451 | static int wait_request_timeout(struct ceph_osd_request *req, | ||
3452 | unsigned long timeout) | ||
2373 | { | 3453 | { |
2374 | struct ceph_osd_event_work *event_work = | 3454 | long left; |
2375 | container_of(work, struct ceph_osd_event_work, work); | ||
2376 | struct ceph_osd_event *event = event_work->event; | ||
2377 | u64 ver = event_work->ver; | ||
2378 | u64 notify_id = event_work->notify_id; | ||
2379 | u8 opcode = event_work->opcode; | ||
2380 | 3455 | ||
2381 | dout("do_event_work completing %p\n", event); | 3456 | dout("%s req %p tid %llu\n", __func__, req, req->r_tid); |
2382 | event->cb(ver, notify_id, opcode, event->data); | 3457 | left = wait_for_completion_killable_timeout(&req->r_completion, |
2383 | dout("do_event_work completed %p\n", event); | 3458 | ceph_timeout_jiffies(timeout)); |
2384 | ceph_osdc_put_event(event); | 3459 | if (left <= 0) { |
2385 | kfree(event_work); | 3460 | left = left ?: -ETIMEDOUT; |
3461 | ceph_osdc_cancel_request(req); | ||
3462 | |||
3463 | /* kludge - need to to wake ceph_osdc_sync() */ | ||
3464 | complete_all(&req->r_safe_completion); | ||
3465 | } else { | ||
3466 | left = req->r_result; /* completed */ | ||
3467 | } | ||
3468 | |||
3469 | return left; | ||
2386 | } | 3470 | } |
2387 | 3471 | ||
3472 | /* | ||
3473 | * wait for a request to complete | ||
3474 | */ | ||
3475 | int ceph_osdc_wait_request(struct ceph_osd_client *osdc, | ||
3476 | struct ceph_osd_request *req) | ||
3477 | { | ||
3478 | return wait_request_timeout(req, 0); | ||
3479 | } | ||
3480 | EXPORT_SYMBOL(ceph_osdc_wait_request); | ||
2388 | 3481 | ||
2389 | /* | 3482 | /* |
2390 | * Process osd watch notifications | 3483 | * sync - wait for all in-flight requests to flush. avoid starvation. |
2391 | */ | 3484 | */ |
2392 | static void handle_watch_notify(struct ceph_osd_client *osdc, | 3485 | void ceph_osdc_sync(struct ceph_osd_client *osdc) |
2393 | struct ceph_msg *msg) | ||
2394 | { | 3486 | { |
2395 | void *p, *end; | 3487 | struct rb_node *n, *p; |
2396 | u8 proto_ver; | 3488 | u64 last_tid = atomic64_read(&osdc->last_tid); |
2397 | u64 cookie, ver, notify_id; | ||
2398 | u8 opcode; | ||
2399 | struct ceph_osd_event *event; | ||
2400 | struct ceph_osd_event_work *event_work; | ||
2401 | 3489 | ||
2402 | p = msg->front.iov_base; | 3490 | again: |
2403 | end = p + msg->front.iov_len; | 3491 | down_read(&osdc->lock); |
3492 | for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { | ||
3493 | struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); | ||
2404 | 3494 | ||
2405 | ceph_decode_8_safe(&p, end, proto_ver, bad); | 3495 | mutex_lock(&osd->lock); |
2406 | ceph_decode_8_safe(&p, end, opcode, bad); | 3496 | for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { |
2407 | ceph_decode_64_safe(&p, end, cookie, bad); | 3497 | struct ceph_osd_request *req = |
2408 | ceph_decode_64_safe(&p, end, ver, bad); | 3498 | rb_entry(p, struct ceph_osd_request, r_node); |
2409 | ceph_decode_64_safe(&p, end, notify_id, bad); | 3499 | |
3500 | if (req->r_tid > last_tid) | ||
3501 | break; | ||
3502 | |||
3503 | if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) | ||
3504 | continue; | ||
2410 | 3505 | ||
2411 | spin_lock(&osdc->event_lock); | 3506 | ceph_osdc_get_request(req); |
2412 | event = __find_event(osdc, cookie); | 3507 | mutex_unlock(&osd->lock); |
2413 | if (event) { | 3508 | up_read(&osdc->lock); |
2414 | BUG_ON(event->one_shot); | 3509 | dout("%s waiting on req %p tid %llu last_tid %llu\n", |
2415 | get_event(event); | 3510 | __func__, req, req->r_tid, last_tid); |
2416 | } | 3511 | wait_for_completion(&req->r_safe_completion); |
2417 | spin_unlock(&osdc->event_lock); | 3512 | ceph_osdc_put_request(req); |
2418 | dout("handle_watch_notify cookie %lld ver %lld event %p\n", | 3513 | goto again; |
2419 | cookie, ver, event); | ||
2420 | if (event) { | ||
2421 | event_work = kmalloc(sizeof(*event_work), GFP_NOIO); | ||
2422 | if (!event_work) { | ||
2423 | pr_err("couldn't allocate event_work\n"); | ||
2424 | ceph_osdc_put_event(event); | ||
2425 | return; | ||
2426 | } | 3514 | } |
2427 | INIT_WORK(&event_work->work, do_event_work); | ||
2428 | event_work->event = event; | ||
2429 | event_work->ver = ver; | ||
2430 | event_work->notify_id = notify_id; | ||
2431 | event_work->opcode = opcode; | ||
2432 | 3515 | ||
2433 | queue_work(osdc->notify_wq, &event_work->work); | 3516 | mutex_unlock(&osd->lock); |
2434 | } | 3517 | } |
2435 | 3518 | ||
2436 | return; | 3519 | up_read(&osdc->lock); |
3520 | dout("%s done last_tid %llu\n", __func__, last_tid); | ||
3521 | } | ||
3522 | EXPORT_SYMBOL(ceph_osdc_sync); | ||
2437 | 3523 | ||
2438 | bad: | 3524 | static struct ceph_osd_request * |
2439 | pr_err("osdc handle_watch_notify corrupt msg\n"); | 3525 | alloc_linger_request(struct ceph_osd_linger_request *lreq) |
3526 | { | ||
3527 | struct ceph_osd_request *req; | ||
3528 | |||
3529 | req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); | ||
3530 | if (!req) | ||
3531 | return NULL; | ||
3532 | |||
3533 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); | ||
3534 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); | ||
3535 | |||
3536 | if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { | ||
3537 | ceph_osdc_put_request(req); | ||
3538 | return NULL; | ||
3539 | } | ||
3540 | |||
3541 | return req; | ||
2440 | } | 3542 | } |
2441 | 3543 | ||
2442 | /* | 3544 | /* |
2443 | * build new request AND message | 3545 | * Returns a handle, caller owns a ref. |
2444 | * | ||
2445 | */ | 3546 | */ |
2446 | void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, | 3547 | struct ceph_osd_linger_request * |
2447 | struct ceph_snap_context *snapc, u64 snap_id, | 3548 | ceph_osdc_watch(struct ceph_osd_client *osdc, |
2448 | struct timespec *mtime) | 3549 | struct ceph_object_id *oid, |
2449 | { | 3550 | struct ceph_object_locator *oloc, |
2450 | struct ceph_msg *msg = req->r_request; | 3551 | rados_watchcb2_t wcb, |
2451 | void *p; | 3552 | rados_watcherrcb_t errcb, |
2452 | size_t msg_size; | 3553 | void *data) |
2453 | int flags = req->r_flags; | 3554 | { |
2454 | u64 data_len; | 3555 | struct ceph_osd_linger_request *lreq; |
2455 | unsigned int i; | 3556 | int ret; |
2456 | |||
2457 | req->r_snapid = snap_id; | ||
2458 | req->r_snapc = ceph_get_snap_context(snapc); | ||
2459 | |||
2460 | /* encode request */ | ||
2461 | msg->hdr.version = cpu_to_le16(4); | ||
2462 | |||
2463 | p = msg->front.iov_base; | ||
2464 | ceph_encode_32(&p, 1); /* client_inc is always 1 */ | ||
2465 | req->r_request_osdmap_epoch = p; | ||
2466 | p += 4; | ||
2467 | req->r_request_flags = p; | ||
2468 | p += 4; | ||
2469 | if (req->r_flags & CEPH_OSD_FLAG_WRITE) | ||
2470 | ceph_encode_timespec(p, mtime); | ||
2471 | p += sizeof(struct ceph_timespec); | ||
2472 | req->r_request_reassert_version = p; | ||
2473 | p += sizeof(struct ceph_eversion); /* will get filled in */ | ||
2474 | |||
2475 | /* oloc */ | ||
2476 | ceph_encode_8(&p, 4); | ||
2477 | ceph_encode_8(&p, 4); | ||
2478 | ceph_encode_32(&p, 8 + 4 + 4); | ||
2479 | req->r_request_pool = p; | ||
2480 | p += 8; | ||
2481 | ceph_encode_32(&p, -1); /* preferred */ | ||
2482 | ceph_encode_32(&p, 0); /* key len */ | ||
2483 | 3557 | ||
2484 | ceph_encode_8(&p, 1); | 3558 | lreq = linger_alloc(osdc); |
2485 | req->r_request_pgid = p; | 3559 | if (!lreq) |
2486 | p += 8 + 4; | 3560 | return ERR_PTR(-ENOMEM); |
2487 | ceph_encode_32(&p, -1); /* preferred */ | ||
2488 | 3561 | ||
2489 | /* oid */ | 3562 | lreq->is_watch = true; |
2490 | ceph_encode_32(&p, req->r_base_oid.name_len); | 3563 | lreq->wcb = wcb; |
2491 | memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); | 3564 | lreq->errcb = errcb; |
2492 | dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, | 3565 | lreq->data = data; |
2493 | req->r_base_oid.name, req->r_base_oid.name_len); | 3566 | lreq->watch_valid_thru = jiffies; |
2494 | p += req->r_base_oid.name_len; | 3567 | |
2495 | 3568 | ceph_oid_copy(&lreq->t.base_oid, oid); | |
2496 | /* ops--can imply data */ | 3569 | ceph_oloc_copy(&lreq->t.base_oloc, oloc); |
2497 | ceph_encode_16(&p, (u16)req->r_num_ops); | 3570 | lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; |
2498 | data_len = 0; | 3571 | lreq->mtime = CURRENT_TIME; |
2499 | for (i = 0; i < req->r_num_ops; i++) { | 3572 | |
2500 | data_len += osd_req_encode_op(req, p, i); | 3573 | lreq->reg_req = alloc_linger_request(lreq); |
2501 | p += sizeof(struct ceph_osd_op); | 3574 | if (!lreq->reg_req) { |
3575 | ret = -ENOMEM; | ||
3576 | goto err_put_lreq; | ||
2502 | } | 3577 | } |
2503 | 3578 | ||
2504 | /* snaps */ | 3579 | lreq->ping_req = alloc_linger_request(lreq); |
2505 | ceph_encode_64(&p, req->r_snapid); | 3580 | if (!lreq->ping_req) { |
2506 | ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); | 3581 | ret = -ENOMEM; |
2507 | ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); | 3582 | goto err_put_lreq; |
2508 | if (req->r_snapc) { | ||
2509 | for (i = 0; i < snapc->num_snaps; i++) { | ||
2510 | ceph_encode_64(&p, req->r_snapc->snaps[i]); | ||
2511 | } | ||
2512 | } | 3583 | } |
2513 | 3584 | ||
2514 | req->r_request_attempts = p; | 3585 | down_write(&osdc->lock); |
2515 | p += 4; | 3586 | linger_register(lreq); /* before osd_req_op_* */ |
2516 | 3587 | osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, | |
2517 | /* data */ | 3588 | CEPH_OSD_WATCH_OP_WATCH); |
2518 | if (flags & CEPH_OSD_FLAG_WRITE) { | 3589 | osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, |
2519 | u16 data_off; | 3590 | CEPH_OSD_WATCH_OP_PING); |
2520 | 3591 | linger_submit(lreq); | |
2521 | /* | 3592 | up_write(&osdc->lock); |
2522 | * The header "data_off" is a hint to the receiver | 3593 | |
2523 | * allowing it to align received data into its | 3594 | ret = linger_reg_commit_wait(lreq); |
2524 | * buffers such that there's no need to re-copy | 3595 | if (ret) { |
2525 | * it before writing it to disk (direct I/O). | 3596 | linger_cancel(lreq); |
2526 | */ | 3597 | goto err_put_lreq; |
2527 | data_off = (u16) (off & 0xffff); | ||
2528 | req->r_request->hdr.data_off = cpu_to_le16(data_off); | ||
2529 | } | 3598 | } |
2530 | req->r_request->hdr.data_len = cpu_to_le32(data_len); | ||
2531 | 3599 | ||
2532 | BUG_ON(p > msg->front.iov_base + msg->front.iov_len); | 3600 | return lreq; |
2533 | msg_size = p - msg->front.iov_base; | ||
2534 | msg->front.iov_len = msg_size; | ||
2535 | msg->hdr.front_len = cpu_to_le32(msg_size); | ||
2536 | 3601 | ||
2537 | dout("build_request msg_size was %d\n", (int)msg_size); | 3602 | err_put_lreq: |
3603 | linger_put(lreq); | ||
3604 | return ERR_PTR(ret); | ||
2538 | } | 3605 | } |
2539 | EXPORT_SYMBOL(ceph_osdc_build_request); | 3606 | EXPORT_SYMBOL(ceph_osdc_watch); |
2540 | 3607 | ||
2541 | /* | 3608 | /* |
2542 | * Register request, send initial attempt. | 3609 | * Releases a ref. |
3610 | * | ||
3611 | * Times out after mount_timeout to preserve rbd unmap behaviour | ||
3612 | * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap | ||
3613 | * with mount_timeout"). | ||
2543 | */ | 3614 | */ |
2544 | int ceph_osdc_start_request(struct ceph_osd_client *osdc, | 3615 | int ceph_osdc_unwatch(struct ceph_osd_client *osdc, |
2545 | struct ceph_osd_request *req, | 3616 | struct ceph_osd_linger_request *lreq) |
2546 | bool nofail) | ||
2547 | { | 3617 | { |
2548 | int rc; | 3618 | struct ceph_options *opts = osdc->client->options; |
3619 | struct ceph_osd_request *req; | ||
3620 | int ret; | ||
2549 | 3621 | ||
2550 | down_read(&osdc->map_sem); | 3622 | req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); |
2551 | mutex_lock(&osdc->request_mutex); | 3623 | if (!req) |
3624 | return -ENOMEM; | ||
3625 | |||
3626 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); | ||
3627 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); | ||
3628 | req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | ||
3629 | req->r_mtime = CURRENT_TIME; | ||
3630 | osd_req_op_watch_init(req, 0, lreq->linger_id, | ||
3631 | CEPH_OSD_WATCH_OP_UNWATCH); | ||
2552 | 3632 | ||
2553 | rc = __ceph_osdc_start_request(osdc, req, nofail); | 3633 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); |
3634 | if (ret) | ||
3635 | goto out_put_req; | ||
2554 | 3636 | ||
2555 | mutex_unlock(&osdc->request_mutex); | 3637 | ceph_osdc_start_request(osdc, req, false); |
2556 | up_read(&osdc->map_sem); | 3638 | linger_cancel(lreq); |
3639 | linger_put(lreq); | ||
3640 | ret = wait_request_timeout(req, opts->mount_timeout); | ||
2557 | 3641 | ||
2558 | return rc; | 3642 | out_put_req: |
3643 | ceph_osdc_put_request(req); | ||
3644 | return ret; | ||
2559 | } | 3645 | } |
2560 | EXPORT_SYMBOL(ceph_osdc_start_request); | 3646 | EXPORT_SYMBOL(ceph_osdc_unwatch); |
2561 | 3647 | ||
2562 | /* | 3648 | static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, |
2563 | * Unregister a registered request. The request is not completed (i.e. | 3649 | u64 notify_id, u64 cookie, void *payload, |
2564 | * no callbacks or wakeups) - higher layers are supposed to know what | 3650 | size_t payload_len) |
2565 | * they are canceling. | ||
2566 | */ | ||
2567 | void ceph_osdc_cancel_request(struct ceph_osd_request *req) | ||
2568 | { | 3651 | { |
2569 | struct ceph_osd_client *osdc = req->r_osdc; | 3652 | struct ceph_osd_req_op *op; |
3653 | struct ceph_pagelist *pl; | ||
3654 | int ret; | ||
2570 | 3655 | ||
2571 | mutex_lock(&osdc->request_mutex); | 3656 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); |
2572 | if (req->r_linger) | 3657 | |
2573 | __unregister_linger_request(osdc, req); | 3658 | pl = kmalloc(sizeof(*pl), GFP_NOIO); |
2574 | __unregister_request(osdc, req); | 3659 | if (!pl) |
2575 | mutex_unlock(&osdc->request_mutex); | 3660 | return -ENOMEM; |
3661 | |||
3662 | ceph_pagelist_init(pl); | ||
3663 | ret = ceph_pagelist_encode_64(pl, notify_id); | ||
3664 | ret |= ceph_pagelist_encode_64(pl, cookie); | ||
3665 | if (payload) { | ||
3666 | ret |= ceph_pagelist_encode_32(pl, payload_len); | ||
3667 | ret |= ceph_pagelist_append(pl, payload, payload_len); | ||
3668 | } else { | ||
3669 | ret |= ceph_pagelist_encode_32(pl, 0); | ||
3670 | } | ||
3671 | if (ret) { | ||
3672 | ceph_pagelist_release(pl); | ||
3673 | return -ENOMEM; | ||
3674 | } | ||
2576 | 3675 | ||
2577 | dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); | 3676 | ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl); |
3677 | op->indata_len = pl->length; | ||
3678 | return 0; | ||
2578 | } | 3679 | } |
2579 | EXPORT_SYMBOL(ceph_osdc_cancel_request); | ||
2580 | 3680 | ||
2581 | /* | 3681 | int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, |
2582 | * wait for a request to complete | 3682 | struct ceph_object_id *oid, |
2583 | */ | 3683 | struct ceph_object_locator *oloc, |
2584 | int ceph_osdc_wait_request(struct ceph_osd_client *osdc, | 3684 | u64 notify_id, |
2585 | struct ceph_osd_request *req) | 3685 | u64 cookie, |
3686 | void *payload, | ||
3687 | size_t payload_len) | ||
2586 | { | 3688 | { |
2587 | int rc; | 3689 | struct ceph_osd_request *req; |
3690 | int ret; | ||
2588 | 3691 | ||
2589 | dout("%s %p tid %llu\n", __func__, req, req->r_tid); | 3692 | req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); |
3693 | if (!req) | ||
3694 | return -ENOMEM; | ||
2590 | 3695 | ||
2591 | rc = wait_for_completion_interruptible(&req->r_completion); | 3696 | ceph_oid_copy(&req->r_base_oid, oid); |
2592 | if (rc < 0) { | 3697 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
2593 | dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); | 3698 | req->r_flags = CEPH_OSD_FLAG_READ; |
2594 | ceph_osdc_cancel_request(req); | 3699 | |
2595 | complete_request(req); | 3700 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); |
2596 | return rc; | 3701 | if (ret) |
3702 | goto out_put_req; | ||
3703 | |||
3704 | ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, | ||
3705 | payload_len); | ||
3706 | if (ret) | ||
3707 | goto out_put_req; | ||
3708 | |||
3709 | ceph_osdc_start_request(osdc, req, false); | ||
3710 | ret = ceph_osdc_wait_request(osdc, req); | ||
3711 | |||
3712 | out_put_req: | ||
3713 | ceph_osdc_put_request(req); | ||
3714 | return ret; | ||
3715 | } | ||
3716 | EXPORT_SYMBOL(ceph_osdc_notify_ack); | ||
3717 | |||
3718 | static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, | ||
3719 | u64 cookie, u32 prot_ver, u32 timeout, | ||
3720 | void *payload, size_t payload_len) | ||
3721 | { | ||
3722 | struct ceph_osd_req_op *op; | ||
3723 | struct ceph_pagelist *pl; | ||
3724 | int ret; | ||
3725 | |||
3726 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); | ||
3727 | op->notify.cookie = cookie; | ||
3728 | |||
3729 | pl = kmalloc(sizeof(*pl), GFP_NOIO); | ||
3730 | if (!pl) | ||
3731 | return -ENOMEM; | ||
3732 | |||
3733 | ceph_pagelist_init(pl); | ||
3734 | ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ | ||
3735 | ret |= ceph_pagelist_encode_32(pl, timeout); | ||
3736 | ret |= ceph_pagelist_encode_32(pl, payload_len); | ||
3737 | ret |= ceph_pagelist_append(pl, payload, payload_len); | ||
3738 | if (ret) { | ||
3739 | ceph_pagelist_release(pl); | ||
3740 | return -ENOMEM; | ||
2597 | } | 3741 | } |
2598 | 3742 | ||
2599 | dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, | 3743 | ceph_osd_data_pagelist_init(&op->notify.request_data, pl); |
2600 | req->r_result); | 3744 | op->indata_len = pl->length; |
2601 | return req->r_result; | 3745 | return 0; |
2602 | } | 3746 | } |
2603 | EXPORT_SYMBOL(ceph_osdc_wait_request); | ||
2604 | 3747 | ||
2605 | /* | 3748 | /* |
2606 | * sync - wait for all in-flight requests to flush. avoid starvation. | 3749 | * @timeout: in seconds |
3750 | * | ||
3751 | * @preply_{pages,len} are initialized both on success and error. | ||
3752 | * The caller is responsible for: | ||
3753 | * | ||
3754 | * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)) | ||
2607 | */ | 3755 | */ |
2608 | void ceph_osdc_sync(struct ceph_osd_client *osdc) | 3756 | int ceph_osdc_notify(struct ceph_osd_client *osdc, |
3757 | struct ceph_object_id *oid, | ||
3758 | struct ceph_object_locator *oloc, | ||
3759 | void *payload, | ||
3760 | size_t payload_len, | ||
3761 | u32 timeout, | ||
3762 | struct page ***preply_pages, | ||
3763 | size_t *preply_len) | ||
2609 | { | 3764 | { |
2610 | struct ceph_osd_request *req; | 3765 | struct ceph_osd_linger_request *lreq; |
2611 | u64 last_tid, next_tid = 0; | 3766 | struct page **pages; |
3767 | int ret; | ||
2612 | 3768 | ||
2613 | mutex_lock(&osdc->request_mutex); | 3769 | WARN_ON(!timeout); |
2614 | last_tid = osdc->last_tid; | 3770 | if (preply_pages) { |
2615 | while (1) { | 3771 | *preply_pages = NULL; |
2616 | req = __lookup_request_ge(osdc, next_tid); | 3772 | *preply_len = 0; |
2617 | if (!req) | 3773 | } |
2618 | break; | ||
2619 | if (req->r_tid > last_tid) | ||
2620 | break; | ||
2621 | 3774 | ||
2622 | next_tid = req->r_tid + 1; | 3775 | lreq = linger_alloc(osdc); |
2623 | if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) | 3776 | if (!lreq) |
2624 | continue; | 3777 | return -ENOMEM; |
2625 | 3778 | ||
2626 | ceph_osdc_get_request(req); | 3779 | lreq->preply_pages = preply_pages; |
2627 | mutex_unlock(&osdc->request_mutex); | 3780 | lreq->preply_len = preply_len; |
2628 | dout("sync waiting on tid %llu (last is %llu)\n", | 3781 | |
2629 | req->r_tid, last_tid); | 3782 | ceph_oid_copy(&lreq->t.base_oid, oid); |
2630 | wait_for_completion(&req->r_safe_completion); | 3783 | ceph_oloc_copy(&lreq->t.base_oloc, oloc); |
2631 | mutex_lock(&osdc->request_mutex); | 3784 | lreq->t.flags = CEPH_OSD_FLAG_READ; |
2632 | ceph_osdc_put_request(req); | 3785 | |
3786 | lreq->reg_req = alloc_linger_request(lreq); | ||
3787 | if (!lreq->reg_req) { | ||
3788 | ret = -ENOMEM; | ||
3789 | goto out_put_lreq; | ||
2633 | } | 3790 | } |
2634 | mutex_unlock(&osdc->request_mutex); | 3791 | |
2635 | dout("sync done (thru tid %llu)\n", last_tid); | 3792 | /* for notify_id */ |
3793 | pages = ceph_alloc_page_vector(1, GFP_NOIO); | ||
3794 | if (IS_ERR(pages)) { | ||
3795 | ret = PTR_ERR(pages); | ||
3796 | goto out_put_lreq; | ||
3797 | } | ||
3798 | |||
3799 | down_write(&osdc->lock); | ||
3800 | linger_register(lreq); /* before osd_req_op_* */ | ||
3801 | ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, | ||
3802 | timeout, payload, payload_len); | ||
3803 | if (ret) { | ||
3804 | linger_unregister(lreq); | ||
3805 | up_write(&osdc->lock); | ||
3806 | ceph_release_page_vector(pages, 1); | ||
3807 | goto out_put_lreq; | ||
3808 | } | ||
3809 | ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, | ||
3810 | response_data), | ||
3811 | pages, PAGE_SIZE, 0, false, true); | ||
3812 | linger_submit(lreq); | ||
3813 | up_write(&osdc->lock); | ||
3814 | |||
3815 | ret = linger_reg_commit_wait(lreq); | ||
3816 | if (!ret) | ||
3817 | ret = linger_notify_finish_wait(lreq); | ||
3818 | else | ||
3819 | dout("lreq %p failed to initiate notify %d\n", lreq, ret); | ||
3820 | |||
3821 | linger_cancel(lreq); | ||
3822 | out_put_lreq: | ||
3823 | linger_put(lreq); | ||
3824 | return ret; | ||
3825 | } | ||
3826 | EXPORT_SYMBOL(ceph_osdc_notify); | ||
3827 | |||
3828 | /* | ||
3829 | * Return the number of milliseconds since the watch was last | ||
3830 | * confirmed, or an error. If there is an error, the watch is no | ||
3831 | * longer valid, and should be destroyed with ceph_osdc_unwatch(). | ||
3832 | */ | ||
3833 | int ceph_osdc_watch_check(struct ceph_osd_client *osdc, | ||
3834 | struct ceph_osd_linger_request *lreq) | ||
3835 | { | ||
3836 | unsigned long stamp, age; | ||
3837 | int ret; | ||
3838 | |||
3839 | down_read(&osdc->lock); | ||
3840 | mutex_lock(&lreq->lock); | ||
3841 | stamp = lreq->watch_valid_thru; | ||
3842 | if (!list_empty(&lreq->pending_lworks)) { | ||
3843 | struct linger_work *lwork = | ||
3844 | list_first_entry(&lreq->pending_lworks, | ||
3845 | struct linger_work, | ||
3846 | pending_item); | ||
3847 | |||
3848 | if (time_before(lwork->queued_stamp, stamp)) | ||
3849 | stamp = lwork->queued_stamp; | ||
3850 | } | ||
3851 | age = jiffies - stamp; | ||
3852 | dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__, | ||
3853 | lreq, lreq->linger_id, age, lreq->last_error); | ||
3854 | /* we are truncating to msecs, so return a safe upper bound */ | ||
3855 | ret = lreq->last_error ?: 1 + jiffies_to_msecs(age); | ||
3856 | |||
3857 | mutex_unlock(&lreq->lock); | ||
3858 | up_read(&osdc->lock); | ||
3859 | return ret; | ||
2636 | } | 3860 | } |
2637 | EXPORT_SYMBOL(ceph_osdc_sync); | ||
2638 | 3861 | ||
2639 | /* | 3862 | /* |
2640 | * Call all pending notify callbacks - for use after a watch is | 3863 | * Call all pending notify callbacks - for use after a watch is |
@@ -2646,6 +3869,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) | |||
2646 | } | 3869 | } |
2647 | EXPORT_SYMBOL(ceph_osdc_flush_notifies); | 3870 | EXPORT_SYMBOL(ceph_osdc_flush_notifies); |
2648 | 3871 | ||
3872 | void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc) | ||
3873 | { | ||
3874 | down_read(&osdc->lock); | ||
3875 | maybe_request_map(osdc); | ||
3876 | up_read(&osdc->lock); | ||
3877 | } | ||
3878 | EXPORT_SYMBOL(ceph_osdc_maybe_request_map); | ||
2649 | 3879 | ||
2650 | /* | 3880 | /* |
2651 | * init, shutdown | 3881 | * init, shutdown |
@@ -2656,43 +3886,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | |||
2656 | 3886 | ||
2657 | dout("init\n"); | 3887 | dout("init\n"); |
2658 | osdc->client = client; | 3888 | osdc->client = client; |
2659 | osdc->osdmap = NULL; | 3889 | init_rwsem(&osdc->lock); |
2660 | init_rwsem(&osdc->map_sem); | ||
2661 | init_completion(&osdc->map_waiters); | ||
2662 | osdc->last_requested_map = 0; | ||
2663 | mutex_init(&osdc->request_mutex); | ||
2664 | osdc->last_tid = 0; | ||
2665 | osdc->osds = RB_ROOT; | 3890 | osdc->osds = RB_ROOT; |
2666 | INIT_LIST_HEAD(&osdc->osd_lru); | 3891 | INIT_LIST_HEAD(&osdc->osd_lru); |
2667 | osdc->requests = RB_ROOT; | 3892 | spin_lock_init(&osdc->osd_lru_lock); |
2668 | INIT_LIST_HEAD(&osdc->req_lru); | 3893 | osd_init(&osdc->homeless_osd); |
2669 | INIT_LIST_HEAD(&osdc->req_unsent); | 3894 | osdc->homeless_osd.o_osdc = osdc; |
2670 | INIT_LIST_HEAD(&osdc->req_notarget); | 3895 | osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; |
2671 | INIT_LIST_HEAD(&osdc->req_linger); | 3896 | osdc->linger_requests = RB_ROOT; |
2672 | osdc->num_requests = 0; | 3897 | osdc->map_checks = RB_ROOT; |
3898 | osdc->linger_map_checks = RB_ROOT; | ||
2673 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); | 3899 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); |
2674 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); | 3900 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); |
2675 | spin_lock_init(&osdc->event_lock); | ||
2676 | osdc->event_tree = RB_ROOT; | ||
2677 | osdc->event_count = 0; | ||
2678 | |||
2679 | schedule_delayed_work(&osdc->osds_timeout_work, | ||
2680 | round_jiffies_relative(osdc->client->options->osd_idle_ttl)); | ||
2681 | 3901 | ||
2682 | err = -ENOMEM; | 3902 | err = -ENOMEM; |
3903 | osdc->osdmap = ceph_osdmap_alloc(); | ||
3904 | if (!osdc->osdmap) | ||
3905 | goto out; | ||
3906 | |||
2683 | osdc->req_mempool = mempool_create_slab_pool(10, | 3907 | osdc->req_mempool = mempool_create_slab_pool(10, |
2684 | ceph_osd_request_cache); | 3908 | ceph_osd_request_cache); |
2685 | if (!osdc->req_mempool) | 3909 | if (!osdc->req_mempool) |
2686 | goto out; | 3910 | goto out_map; |
2687 | 3911 | ||
2688 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, | 3912 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, |
2689 | OSD_OP_FRONT_LEN, 10, true, | 3913 | PAGE_SIZE, 10, true, "osd_op"); |
2690 | "osd_op"); | ||
2691 | if (err < 0) | 3914 | if (err < 0) |
2692 | goto out_mempool; | 3915 | goto out_mempool; |
2693 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, | 3916 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, |
2694 | OSD_OPREPLY_FRONT_LEN, 10, true, | 3917 | PAGE_SIZE, 10, true, "osd_op_reply"); |
2695 | "osd_op_reply"); | ||
2696 | if (err < 0) | 3918 | if (err < 0) |
2697 | goto out_msgpool; | 3919 | goto out_msgpool; |
2698 | 3920 | ||
@@ -2701,6 +3923,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | |||
2701 | if (!osdc->notify_wq) | 3923 | if (!osdc->notify_wq) |
2702 | goto out_msgpool_reply; | 3924 | goto out_msgpool_reply; |
2703 | 3925 | ||
3926 | schedule_delayed_work(&osdc->timeout_work, | ||
3927 | osdc->client->options->osd_keepalive_timeout); | ||
3928 | schedule_delayed_work(&osdc->osds_timeout_work, | ||
3929 | round_jiffies_relative(osdc->client->options->osd_idle_ttl)); | ||
3930 | |||
2704 | return 0; | 3931 | return 0; |
2705 | 3932 | ||
2706 | out_msgpool_reply: | 3933 | out_msgpool_reply: |
@@ -2709,6 +3936,8 @@ out_msgpool: | |||
2709 | ceph_msgpool_destroy(&osdc->msgpool_op); | 3936 | ceph_msgpool_destroy(&osdc->msgpool_op); |
2710 | out_mempool: | 3937 | out_mempool: |
2711 | mempool_destroy(osdc->req_mempool); | 3938 | mempool_destroy(osdc->req_mempool); |
3939 | out_map: | ||
3940 | ceph_osdmap_destroy(osdc->osdmap); | ||
2712 | out: | 3941 | out: |
2713 | return err; | 3942 | return err; |
2714 | } | 3943 | } |
@@ -2719,11 +3948,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) | |||
2719 | destroy_workqueue(osdc->notify_wq); | 3948 | destroy_workqueue(osdc->notify_wq); |
2720 | cancel_delayed_work_sync(&osdc->timeout_work); | 3949 | cancel_delayed_work_sync(&osdc->timeout_work); |
2721 | cancel_delayed_work_sync(&osdc->osds_timeout_work); | 3950 | cancel_delayed_work_sync(&osdc->osds_timeout_work); |
2722 | if (osdc->osdmap) { | 3951 | |
2723 | ceph_osdmap_destroy(osdc->osdmap); | 3952 | down_write(&osdc->lock); |
2724 | osdc->osdmap = NULL; | 3953 | while (!RB_EMPTY_ROOT(&osdc->osds)) { |
3954 | struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), | ||
3955 | struct ceph_osd, o_node); | ||
3956 | close_osd(osd); | ||
2725 | } | 3957 | } |
2726 | remove_all_osds(osdc); | 3958 | up_write(&osdc->lock); |
3959 | WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); | ||
3960 | osd_cleanup(&osdc->homeless_osd); | ||
3961 | |||
3962 | WARN_ON(!list_empty(&osdc->osd_lru)); | ||
3963 | WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); | ||
3964 | WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks)); | ||
3965 | WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks)); | ||
3966 | WARN_ON(atomic_read(&osdc->num_requests)); | ||
3967 | WARN_ON(atomic_read(&osdc->num_homeless)); | ||
3968 | |||
3969 | ceph_osdmap_destroy(osdc->osdmap); | ||
2727 | mempool_destroy(osdc->req_mempool); | 3970 | mempool_destroy(osdc->req_mempool); |
2728 | ceph_msgpool_destroy(&osdc->msgpool_op); | 3971 | ceph_msgpool_destroy(&osdc->msgpool_op); |
2729 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); | 3972 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); |
@@ -2752,15 +3995,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, | |||
2752 | return PTR_ERR(req); | 3995 | return PTR_ERR(req); |
2753 | 3996 | ||
2754 | /* it may be a short read due to an object boundary */ | 3997 | /* it may be a short read due to an object boundary */ |
2755 | |||
2756 | osd_req_op_extent_osd_data_pages(req, 0, | 3998 | osd_req_op_extent_osd_data_pages(req, 0, |
2757 | pages, *plen, page_align, false, false); | 3999 | pages, *plen, page_align, false, false); |
2758 | 4000 | ||
2759 | dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", | 4001 | dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", |
2760 | off, *plen, *plen, page_align); | 4002 | off, *plen, *plen, page_align); |
2761 | 4003 | ||
2762 | ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); | ||
2763 | |||
2764 | rc = ceph_osdc_start_request(osdc, req, false); | 4004 | rc = ceph_osdc_start_request(osdc, req, false); |
2765 | if (!rc) | 4005 | if (!rc) |
2766 | rc = ceph_osdc_wait_request(osdc, req); | 4006 | rc = ceph_osdc_wait_request(osdc, req); |
@@ -2786,7 +4026,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
2786 | int rc = 0; | 4026 | int rc = 0; |
2787 | int page_align = off & ~PAGE_MASK; | 4027 | int page_align = off & ~PAGE_MASK; |
2788 | 4028 | ||
2789 | BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ | ||
2790 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, | 4029 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, |
2791 | CEPH_OSD_OP_WRITE, | 4030 | CEPH_OSD_OP_WRITE, |
2792 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, | 4031 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, |
@@ -2800,8 +4039,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
2800 | false, false); | 4039 | false, false); |
2801 | dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); | 4040 | dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); |
2802 | 4041 | ||
2803 | ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); | 4042 | req->r_mtime = *mtime; |
2804 | |||
2805 | rc = ceph_osdc_start_request(osdc, req, true); | 4043 | rc = ceph_osdc_start_request(osdc, req, true); |
2806 | if (!rc) | 4044 | if (!rc) |
2807 | rc = ceph_osdc_wait_request(osdc, req); | 4045 | rc = ceph_osdc_wait_request(osdc, req); |
@@ -2841,19 +4079,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup); | |||
2841 | static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) | 4079 | static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) |
2842 | { | 4080 | { |
2843 | struct ceph_osd *osd = con->private; | 4081 | struct ceph_osd *osd = con->private; |
2844 | struct ceph_osd_client *osdc; | 4082 | struct ceph_osd_client *osdc = osd->o_osdc; |
2845 | int type = le16_to_cpu(msg->hdr.type); | 4083 | int type = le16_to_cpu(msg->hdr.type); |
2846 | 4084 | ||
2847 | if (!osd) | ||
2848 | goto out; | ||
2849 | osdc = osd->o_osdc; | ||
2850 | |||
2851 | switch (type) { | 4085 | switch (type) { |
2852 | case CEPH_MSG_OSD_MAP: | 4086 | case CEPH_MSG_OSD_MAP: |
2853 | ceph_osdc_handle_map(osdc, msg); | 4087 | ceph_osdc_handle_map(osdc, msg); |
2854 | break; | 4088 | break; |
2855 | case CEPH_MSG_OSD_OPREPLY: | 4089 | case CEPH_MSG_OSD_OPREPLY: |
2856 | handle_reply(osdc, msg); | 4090 | handle_reply(osd, msg); |
2857 | break; | 4091 | break; |
2858 | case CEPH_MSG_WATCH_NOTIFY: | 4092 | case CEPH_MSG_WATCH_NOTIFY: |
2859 | handle_watch_notify(osdc, msg); | 4093 | handle_watch_notify(osdc, msg); |
@@ -2863,7 +4097,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) | |||
2863 | pr_err("received unknown message type %d %s\n", type, | 4097 | pr_err("received unknown message type %d %s\n", type, |
2864 | ceph_msg_type_name(type)); | 4098 | ceph_msg_type_name(type)); |
2865 | } | 4099 | } |
2866 | out: | 4100 | |
2867 | ceph_msg_put(msg); | 4101 | ceph_msg_put(msg); |
2868 | } | 4102 | } |
2869 | 4103 | ||
@@ -2878,21 +4112,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
2878 | { | 4112 | { |
2879 | struct ceph_osd *osd = con->private; | 4113 | struct ceph_osd *osd = con->private; |
2880 | struct ceph_osd_client *osdc = osd->o_osdc; | 4114 | struct ceph_osd_client *osdc = osd->o_osdc; |
2881 | struct ceph_msg *m; | 4115 | struct ceph_msg *m = NULL; |
2882 | struct ceph_osd_request *req; | 4116 | struct ceph_osd_request *req; |
2883 | int front_len = le32_to_cpu(hdr->front_len); | 4117 | int front_len = le32_to_cpu(hdr->front_len); |
2884 | int data_len = le32_to_cpu(hdr->data_len); | 4118 | int data_len = le32_to_cpu(hdr->data_len); |
2885 | u64 tid; | 4119 | u64 tid = le64_to_cpu(hdr->tid); |
2886 | 4120 | ||
2887 | tid = le64_to_cpu(hdr->tid); | 4121 | down_read(&osdc->lock); |
2888 | mutex_lock(&osdc->request_mutex); | 4122 | if (!osd_registered(osd)) { |
2889 | req = __lookup_request(osdc, tid); | 4123 | dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd); |
4124 | *skip = 1; | ||
4125 | goto out_unlock_osdc; | ||
4126 | } | ||
4127 | WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num)); | ||
4128 | |||
4129 | mutex_lock(&osd->lock); | ||
4130 | req = lookup_request(&osd->o_requests, tid); | ||
2890 | if (!req) { | 4131 | if (!req) { |
2891 | dout("%s osd%d tid %llu unknown, skipping\n", __func__, | 4132 | dout("%s osd%d tid %llu unknown, skipping\n", __func__, |
2892 | osd->o_osd, tid); | 4133 | osd->o_osd, tid); |
2893 | m = NULL; | ||
2894 | *skip = 1; | 4134 | *skip = 1; |
2895 | goto out; | 4135 | goto out_unlock_session; |
2896 | } | 4136 | } |
2897 | 4137 | ||
2898 | ceph_msg_revoke_incoming(req->r_reply); | 4138 | ceph_msg_revoke_incoming(req->r_reply); |
@@ -2904,7 +4144,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
2904 | m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, | 4144 | m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, |
2905 | false); | 4145 | false); |
2906 | if (!m) | 4146 | if (!m) |
2907 | goto out; | 4147 | goto out_unlock_session; |
2908 | ceph_msg_put(req->r_reply); | 4148 | ceph_msg_put(req->r_reply); |
2909 | req->r_reply = m; | 4149 | req->r_reply = m; |
2910 | } | 4150 | } |
@@ -2915,14 +4155,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
2915 | req->r_reply->data_length); | 4155 | req->r_reply->data_length); |
2916 | m = NULL; | 4156 | m = NULL; |
2917 | *skip = 1; | 4157 | *skip = 1; |
2918 | goto out; | 4158 | goto out_unlock_session; |
2919 | } | 4159 | } |
2920 | 4160 | ||
2921 | m = ceph_msg_get(req->r_reply); | 4161 | m = ceph_msg_get(req->r_reply); |
2922 | dout("get_reply tid %lld %p\n", tid, m); | 4162 | dout("get_reply tid %lld %p\n", tid, m); |
2923 | 4163 | ||
2924 | out: | 4164 | out_unlock_session: |
2925 | mutex_unlock(&osdc->request_mutex); | 4165 | mutex_unlock(&osd->lock); |
4166 | out_unlock_osdc: | ||
4167 | up_read(&osdc->lock); | ||
4168 | return m; | ||
4169 | } | ||
4170 | |||
4171 | /* | ||
4172 | * TODO: switch to a msg-owned pagelist | ||
4173 | */ | ||
4174 | static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) | ||
4175 | { | ||
4176 | struct ceph_msg *m; | ||
4177 | int type = le16_to_cpu(hdr->type); | ||
4178 | u32 front_len = le32_to_cpu(hdr->front_len); | ||
4179 | u32 data_len = le32_to_cpu(hdr->data_len); | ||
4180 | |||
4181 | m = ceph_msg_new(type, front_len, GFP_NOIO, false); | ||
4182 | if (!m) | ||
4183 | return NULL; | ||
4184 | |||
4185 | if (data_len) { | ||
4186 | struct page **pages; | ||
4187 | struct ceph_osd_data osd_data; | ||
4188 | |||
4189 | pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), | ||
4190 | GFP_NOIO); | ||
4191 | if (!pages) { | ||
4192 | ceph_msg_put(m); | ||
4193 | return NULL; | ||
4194 | } | ||
4195 | |||
4196 | ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false, | ||
4197 | false); | ||
4198 | ceph_osdc_msg_data_add(m, &osd_data); | ||
4199 | } | ||
4200 | |||
2926 | return m; | 4201 | return m; |
2927 | } | 4202 | } |
2928 | 4203 | ||
@@ -2932,18 +4207,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, | |||
2932 | { | 4207 | { |
2933 | struct ceph_osd *osd = con->private; | 4208 | struct ceph_osd *osd = con->private; |
2934 | int type = le16_to_cpu(hdr->type); | 4209 | int type = le16_to_cpu(hdr->type); |
2935 | int front = le32_to_cpu(hdr->front_len); | ||
2936 | 4210 | ||
2937 | *skip = 0; | 4211 | *skip = 0; |
2938 | switch (type) { | 4212 | switch (type) { |
2939 | case CEPH_MSG_OSD_MAP: | 4213 | case CEPH_MSG_OSD_MAP: |
2940 | case CEPH_MSG_WATCH_NOTIFY: | 4214 | case CEPH_MSG_WATCH_NOTIFY: |
2941 | return ceph_msg_new(type, front, GFP_NOFS, false); | 4215 | return alloc_msg_with_page_vector(hdr); |
2942 | case CEPH_MSG_OSD_OPREPLY: | 4216 | case CEPH_MSG_OSD_OPREPLY: |
2943 | return get_reply(con, hdr, skip); | 4217 | return get_reply(con, hdr, skip); |
2944 | default: | 4218 | default: |
2945 | pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, | 4219 | pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__, |
2946 | osd->o_osd); | 4220 | osd->o_osd, type); |
2947 | *skip = 1; | 4221 | *skip = 1; |
2948 | return NULL; | 4222 | return NULL; |
2949 | } | 4223 | } |
@@ -3047,5 +4321,5 @@ static const struct ceph_connection_operations osd_con_ops = { | |||
3047 | .alloc_msg = alloc_msg, | 4321 | .alloc_msg = alloc_msg, |
3048 | .sign_message = osd_sign_message, | 4322 | .sign_message = osd_sign_message, |
3049 | .check_message_signature = osd_check_message_signature, | 4323 | .check_message_signature = osd_check_message_signature, |
3050 | .fault = osd_reset, | 4324 | .fault = osd_fault, |
3051 | }; | 4325 | }; |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 243574c8cf33..cde52e94732f 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -380,23 +380,24 @@ bad: | |||
380 | return ERR_PTR(err); | 380 | return ERR_PTR(err); |
381 | } | 381 | } |
382 | 382 | ||
383 | /* | 383 | int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) |
384 | * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid | ||
385 | * to a set of osds) and primary_temp (explicit primary setting) | ||
386 | */ | ||
387 | static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) | ||
388 | { | 384 | { |
389 | if (l.pool < r.pool) | 385 | if (lhs->pool < rhs->pool) |
390 | return -1; | 386 | return -1; |
391 | if (l.pool > r.pool) | 387 | if (lhs->pool > rhs->pool) |
392 | return 1; | 388 | return 1; |
393 | if (l.seed < r.seed) | 389 | if (lhs->seed < rhs->seed) |
394 | return -1; | 390 | return -1; |
395 | if (l.seed > r.seed) | 391 | if (lhs->seed > rhs->seed) |
396 | return 1; | 392 | return 1; |
393 | |||
397 | return 0; | 394 | return 0; |
398 | } | 395 | } |
399 | 396 | ||
397 | /* | ||
398 | * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid | ||
399 | * to a set of osds) and primary_temp (explicit primary setting) | ||
400 | */ | ||
400 | static int __insert_pg_mapping(struct ceph_pg_mapping *new, | 401 | static int __insert_pg_mapping(struct ceph_pg_mapping *new, |
401 | struct rb_root *root) | 402 | struct rb_root *root) |
402 | { | 403 | { |
@@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, | |||
409 | while (*p) { | 410 | while (*p) { |
410 | parent = *p; | 411 | parent = *p; |
411 | pg = rb_entry(parent, struct ceph_pg_mapping, node); | 412 | pg = rb_entry(parent, struct ceph_pg_mapping, node); |
412 | c = pgid_cmp(new->pgid, pg->pgid); | 413 | c = ceph_pg_compare(&new->pgid, &pg->pgid); |
413 | if (c < 0) | 414 | if (c < 0) |
414 | p = &(*p)->rb_left; | 415 | p = &(*p)->rb_left; |
415 | else if (c > 0) | 416 | else if (c > 0) |
@@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, | |||
432 | 433 | ||
433 | while (n) { | 434 | while (n) { |
434 | pg = rb_entry(n, struct ceph_pg_mapping, node); | 435 | pg = rb_entry(n, struct ceph_pg_mapping, node); |
435 | c = pgid_cmp(pgid, pg->pgid); | 436 | c = ceph_pg_compare(&pgid, &pg->pgid); |
436 | if (c < 0) { | 437 | if (c < 0) { |
437 | n = n->rb_left; | 438 | n = n->rb_left; |
438 | } else if (c > 0) { | 439 | } else if (c > 0) { |
@@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | |||
596 | *p += 4; /* skip crash_replay_interval */ | 597 | *p += 4; /* skip crash_replay_interval */ |
597 | 598 | ||
598 | if (ev >= 7) | 599 | if (ev >= 7) |
599 | *p += 1; /* skip min_size */ | 600 | pi->min_size = ceph_decode_8(p); |
601 | else | ||
602 | pi->min_size = pi->size - pi->size / 2; | ||
600 | 603 | ||
601 | if (ev >= 8) | 604 | if (ev >= 8) |
602 | *p += 8 + 8; /* skip quota_max_* */ | 605 | *p += 8 + 8; /* skip quota_max_* */ |
@@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) | |||
616 | pi->write_tier = -1; | 619 | pi->write_tier = -1; |
617 | } | 620 | } |
618 | 621 | ||
622 | if (ev >= 10) { | ||
623 | /* skip properties */ | ||
624 | num = ceph_decode_32(p); | ||
625 | while (num--) { | ||
626 | len = ceph_decode_32(p); | ||
627 | *p += len; /* key */ | ||
628 | len = ceph_decode_32(p); | ||
629 | *p += len; /* val */ | ||
630 | } | ||
631 | } | ||
632 | |||
633 | if (ev >= 11) { | ||
634 | /* skip hit_set_params */ | ||
635 | *p += 1 + 1; /* versions */ | ||
636 | len = ceph_decode_32(p); | ||
637 | *p += len; | ||
638 | |||
639 | *p += 4; /* skip hit_set_period */ | ||
640 | *p += 4; /* skip hit_set_count */ | ||
641 | } | ||
642 | |||
643 | if (ev >= 12) | ||
644 | *p += 4; /* skip stripe_width */ | ||
645 | |||
646 | if (ev >= 13) { | ||
647 | *p += 8; /* skip target_max_bytes */ | ||
648 | *p += 8; /* skip target_max_objects */ | ||
649 | *p += 4; /* skip cache_target_dirty_ratio_micro */ | ||
650 | *p += 4; /* skip cache_target_full_ratio_micro */ | ||
651 | *p += 4; /* skip cache_min_flush_age */ | ||
652 | *p += 4; /* skip cache_min_evict_age */ | ||
653 | } | ||
654 | |||
655 | if (ev >= 14) { | ||
656 | /* skip erasure_code_profile */ | ||
657 | len = ceph_decode_32(p); | ||
658 | *p += len; | ||
659 | } | ||
660 | |||
661 | if (ev >= 15) | ||
662 | pi->last_force_request_resend = ceph_decode_32(p); | ||
663 | else | ||
664 | pi->last_force_request_resend = 0; | ||
665 | |||
619 | /* ignore the rest */ | 666 | /* ignore the rest */ |
620 | 667 | ||
621 | *p = pool_end; | 668 | *p = pool_end; |
@@ -660,6 +707,23 @@ bad: | |||
660 | /* | 707 | /* |
661 | * osd map | 708 | * osd map |
662 | */ | 709 | */ |
710 | struct ceph_osdmap *ceph_osdmap_alloc(void) | ||
711 | { | ||
712 | struct ceph_osdmap *map; | ||
713 | |||
714 | map = kzalloc(sizeof(*map), GFP_NOIO); | ||
715 | if (!map) | ||
716 | return NULL; | ||
717 | |||
718 | map->pg_pools = RB_ROOT; | ||
719 | map->pool_max = -1; | ||
720 | map->pg_temp = RB_ROOT; | ||
721 | map->primary_temp = RB_ROOT; | ||
722 | mutex_init(&map->crush_scratch_mutex); | ||
723 | |||
724 | return map; | ||
725 | } | ||
726 | |||
663 | void ceph_osdmap_destroy(struct ceph_osdmap *map) | 727 | void ceph_osdmap_destroy(struct ceph_osdmap *map) |
664 | { | 728 | { |
665 | dout("osdmap_destroy %p\n", map); | 729 | dout("osdmap_destroy %p\n", map); |
@@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) | |||
1183 | struct ceph_osdmap *map; | 1247 | struct ceph_osdmap *map; |
1184 | int ret; | 1248 | int ret; |
1185 | 1249 | ||
1186 | map = kzalloc(sizeof(*map), GFP_NOFS); | 1250 | map = ceph_osdmap_alloc(); |
1187 | if (!map) | 1251 | if (!map) |
1188 | return ERR_PTR(-ENOMEM); | 1252 | return ERR_PTR(-ENOMEM); |
1189 | 1253 | ||
1190 | map->pg_temp = RB_ROOT; | ||
1191 | map->primary_temp = RB_ROOT; | ||
1192 | mutex_init(&map->crush_scratch_mutex); | ||
1193 | |||
1194 | ret = osdmap_decode(p, end, map); | 1254 | ret = osdmap_decode(p, end, map); |
1195 | if (ret) { | 1255 | if (ret) { |
1196 | ceph_osdmap_destroy(map); | 1256 | ceph_osdmap_destroy(map); |
@@ -1204,8 +1264,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) | |||
1204 | * decode and apply an incremental map update. | 1264 | * decode and apply an incremental map update. |
1205 | */ | 1265 | */ |
1206 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | 1266 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, |
1207 | struct ceph_osdmap *map, | 1267 | struct ceph_osdmap *map) |
1208 | struct ceph_messenger *msgr) | ||
1209 | { | 1268 | { |
1210 | struct crush_map *newcrush = NULL; | 1269 | struct crush_map *newcrush = NULL; |
1211 | struct ceph_fsid fsid; | 1270 | struct ceph_fsid fsid; |
@@ -1381,8 +1440,252 @@ bad: | |||
1381 | return ERR_PTR(err); | 1440 | return ERR_PTR(err); |
1382 | } | 1441 | } |
1383 | 1442 | ||
1443 | void ceph_oid_copy(struct ceph_object_id *dest, | ||
1444 | const struct ceph_object_id *src) | ||
1445 | { | ||
1446 | WARN_ON(!ceph_oid_empty(dest)); | ||
1447 | |||
1448 | if (src->name != src->inline_name) { | ||
1449 | /* very rare, see ceph_object_id definition */ | ||
1450 | dest->name = kmalloc(src->name_len + 1, | ||
1451 | GFP_NOIO | __GFP_NOFAIL); | ||
1452 | } | ||
1453 | |||
1454 | memcpy(dest->name, src->name, src->name_len + 1); | ||
1455 | dest->name_len = src->name_len; | ||
1456 | } | ||
1457 | EXPORT_SYMBOL(ceph_oid_copy); | ||
1458 | |||
1459 | static __printf(2, 0) | ||
1460 | int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) | ||
1461 | { | ||
1462 | int len; | ||
1463 | |||
1464 | WARN_ON(!ceph_oid_empty(oid)); | ||
1465 | |||
1466 | len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); | ||
1467 | if (len >= sizeof(oid->inline_name)) | ||
1468 | return len; | ||
1469 | |||
1470 | oid->name_len = len; | ||
1471 | return 0; | ||
1472 | } | ||
1473 | |||
1474 | /* | ||
1475 | * If oid doesn't fit into inline buffer, BUG. | ||
1476 | */ | ||
1477 | void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) | ||
1478 | { | ||
1479 | va_list ap; | ||
1480 | |||
1481 | va_start(ap, fmt); | ||
1482 | BUG_ON(oid_printf_vargs(oid, fmt, ap)); | ||
1483 | va_end(ap); | ||
1484 | } | ||
1485 | EXPORT_SYMBOL(ceph_oid_printf); | ||
1486 | |||
1487 | static __printf(3, 0) | ||
1488 | int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, | ||
1489 | const char *fmt, va_list ap) | ||
1490 | { | ||
1491 | va_list aq; | ||
1492 | int len; | ||
1493 | |||
1494 | va_copy(aq, ap); | ||
1495 | len = oid_printf_vargs(oid, fmt, aq); | ||
1496 | va_end(aq); | ||
1497 | |||
1498 | if (len) { | ||
1499 | char *external_name; | ||
1500 | |||
1501 | external_name = kmalloc(len + 1, gfp); | ||
1502 | if (!external_name) | ||
1503 | return -ENOMEM; | ||
1504 | |||
1505 | oid->name = external_name; | ||
1506 | WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); | ||
1507 | oid->name_len = len; | ||
1508 | } | ||
1509 | |||
1510 | return 0; | ||
1511 | } | ||
1512 | |||
1513 | /* | ||
1514 | * If oid doesn't fit into inline buffer, allocate. | ||
1515 | */ | ||
1516 | int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, | ||
1517 | const char *fmt, ...) | ||
1518 | { | ||
1519 | va_list ap; | ||
1520 | int ret; | ||
1521 | |||
1522 | va_start(ap, fmt); | ||
1523 | ret = oid_aprintf_vargs(oid, gfp, fmt, ap); | ||
1524 | va_end(ap); | ||
1525 | |||
1526 | return ret; | ||
1527 | } | ||
1528 | EXPORT_SYMBOL(ceph_oid_aprintf); | ||
1529 | |||
1530 | void ceph_oid_destroy(struct ceph_object_id *oid) | ||
1531 | { | ||
1532 | if (oid->name != oid->inline_name) | ||
1533 | kfree(oid->name); | ||
1534 | } | ||
1535 | EXPORT_SYMBOL(ceph_oid_destroy); | ||
1536 | |||
1537 | /* | ||
1538 | * osds only | ||
1539 | */ | ||
1540 | static bool __osds_equal(const struct ceph_osds *lhs, | ||
1541 | const struct ceph_osds *rhs) | ||
1542 | { | ||
1543 | if (lhs->size == rhs->size && | ||
1544 | !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) | ||
1545 | return true; | ||
1546 | |||
1547 | return false; | ||
1548 | } | ||
1549 | |||
1550 | /* | ||
1551 | * osds + primary | ||
1552 | */ | ||
1553 | static bool osds_equal(const struct ceph_osds *lhs, | ||
1554 | const struct ceph_osds *rhs) | ||
1555 | { | ||
1556 | if (__osds_equal(lhs, rhs) && | ||
1557 | lhs->primary == rhs->primary) | ||
1558 | return true; | ||
1559 | |||
1560 | return false; | ||
1561 | } | ||
1562 | |||
1563 | static bool osds_valid(const struct ceph_osds *set) | ||
1564 | { | ||
1565 | /* non-empty set */ | ||
1566 | if (set->size > 0 && set->primary >= 0) | ||
1567 | return true; | ||
1568 | |||
1569 | /* empty can_shift_osds set */ | ||
1570 | if (!set->size && set->primary == -1) | ||
1571 | return true; | ||
1572 | |||
1573 | /* empty !can_shift_osds set - all NONE */ | ||
1574 | if (set->size > 0 && set->primary == -1) { | ||
1575 | int i; | ||
1576 | |||
1577 | for (i = 0; i < set->size; i++) { | ||
1578 | if (set->osds[i] != CRUSH_ITEM_NONE) | ||
1579 | break; | ||
1580 | } | ||
1581 | if (i == set->size) | ||
1582 | return true; | ||
1583 | } | ||
1584 | |||
1585 | return false; | ||
1586 | } | ||
1587 | |||
1588 | void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) | ||
1589 | { | ||
1590 | memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); | ||
1591 | dest->size = src->size; | ||
1592 | dest->primary = src->primary; | ||
1593 | } | ||
1594 | |||
1595 | static bool is_split(const struct ceph_pg *pgid, | ||
1596 | u32 old_pg_num, | ||
1597 | u32 new_pg_num) | ||
1598 | { | ||
1599 | int old_bits = calc_bits_of(old_pg_num); | ||
1600 | int old_mask = (1 << old_bits) - 1; | ||
1601 | int n; | ||
1602 | |||
1603 | WARN_ON(pgid->seed >= old_pg_num); | ||
1604 | if (new_pg_num <= old_pg_num) | ||
1605 | return false; | ||
1606 | |||
1607 | for (n = 1; ; n++) { | ||
1608 | int next_bit = n << (old_bits - 1); | ||
1609 | u32 s = next_bit | pgid->seed; | ||
1610 | |||
1611 | if (s < old_pg_num || s == pgid->seed) | ||
1612 | continue; | ||
1613 | if (s >= new_pg_num) | ||
1614 | break; | ||
1615 | |||
1616 | s = ceph_stable_mod(s, old_pg_num, old_mask); | ||
1617 | if (s == pgid->seed) | ||
1618 | return true; | ||
1619 | } | ||
1620 | |||
1621 | return false; | ||
1622 | } | ||
1623 | |||
1624 | bool ceph_is_new_interval(const struct ceph_osds *old_acting, | ||
1625 | const struct ceph_osds *new_acting, | ||
1626 | const struct ceph_osds *old_up, | ||
1627 | const struct ceph_osds *new_up, | ||
1628 | int old_size, | ||
1629 | int new_size, | ||
1630 | int old_min_size, | ||
1631 | int new_min_size, | ||
1632 | u32 old_pg_num, | ||
1633 | u32 new_pg_num, | ||
1634 | bool old_sort_bitwise, | ||
1635 | bool new_sort_bitwise, | ||
1636 | const struct ceph_pg *pgid) | ||
1637 | { | ||
1638 | return !osds_equal(old_acting, new_acting) || | ||
1639 | !osds_equal(old_up, new_up) || | ||
1640 | old_size != new_size || | ||
1641 | old_min_size != new_min_size || | ||
1642 | is_split(pgid, old_pg_num, new_pg_num) || | ||
1643 | old_sort_bitwise != new_sort_bitwise; | ||
1644 | } | ||
1645 | |||
1646 | static int calc_pg_rank(int osd, const struct ceph_osds *acting) | ||
1647 | { | ||
1648 | int i; | ||
1649 | |||
1650 | for (i = 0; i < acting->size; i++) { | ||
1651 | if (acting->osds[i] == osd) | ||
1652 | return i; | ||
1653 | } | ||
1654 | |||
1655 | return -1; | ||
1656 | } | ||
1657 | |||
1658 | static bool primary_changed(const struct ceph_osds *old_acting, | ||
1659 | const struct ceph_osds *new_acting) | ||
1660 | { | ||
1661 | if (!old_acting->size && !new_acting->size) | ||
1662 | return false; /* both still empty */ | ||
1384 | 1663 | ||
1664 | if (!old_acting->size ^ !new_acting->size) | ||
1665 | return true; /* was empty, now not, or vice versa */ | ||
1385 | 1666 | ||
1667 | if (old_acting->primary != new_acting->primary) | ||
1668 | return true; /* primary changed */ | ||
1669 | |||
1670 | if (calc_pg_rank(old_acting->primary, old_acting) != | ||
1671 | calc_pg_rank(new_acting->primary, new_acting)) | ||
1672 | return true; | ||
1673 | |||
1674 | return false; /* same primary (tho replicas may have changed) */ | ||
1675 | } | ||
1676 | |||
1677 | bool ceph_osds_changed(const struct ceph_osds *old_acting, | ||
1678 | const struct ceph_osds *new_acting, | ||
1679 | bool any_change) | ||
1680 | { | ||
1681 | if (primary_changed(old_acting, new_acting)) | ||
1682 | return true; | ||
1683 | |||
1684 | if (any_change && !__osds_equal(old_acting, new_acting)) | ||
1685 | return true; | ||
1686 | |||
1687 | return false; | ||
1688 | } | ||
1386 | 1689 | ||
1387 | /* | 1690 | /* |
1388 | * calculate file layout from given offset, length. | 1691 | * calculate file layout from given offset, length. |
@@ -1455,30 +1758,71 @@ invalid: | |||
1455 | EXPORT_SYMBOL(ceph_calc_file_object_mapping); | 1758 | EXPORT_SYMBOL(ceph_calc_file_object_mapping); |
1456 | 1759 | ||
1457 | /* | 1760 | /* |
1458 | * Calculate mapping of a (oloc, oid) pair to a PG. Should only be | 1761 | * Map an object into a PG. |
1459 | * called with target's (oloc, oid), since tiering isn't taken into | 1762 | * |
1460 | * account. | 1763 | * Should only be called with target_oid and target_oloc (as opposed to |
1764 | * base_oid and base_oloc), since tiering isn't taken into account. | ||
1461 | */ | 1765 | */ |
1462 | int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, | 1766 | int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, |
1463 | struct ceph_object_locator *oloc, | 1767 | struct ceph_object_id *oid, |
1464 | struct ceph_object_id *oid, | 1768 | struct ceph_object_locator *oloc, |
1465 | struct ceph_pg *pg_out) | 1769 | struct ceph_pg *raw_pgid) |
1466 | { | 1770 | { |
1467 | struct ceph_pg_pool_info *pi; | 1771 | struct ceph_pg_pool_info *pi; |
1468 | 1772 | ||
1469 | pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); | 1773 | pi = ceph_pg_pool_by_id(osdmap, oloc->pool); |
1470 | if (!pi) | 1774 | if (!pi) |
1471 | return -EIO; | 1775 | return -ENOENT; |
1472 | 1776 | ||
1473 | pg_out->pool = oloc->pool; | 1777 | raw_pgid->pool = oloc->pool; |
1474 | pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, | 1778 | raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, |
1475 | oid->name_len); | 1779 | oid->name_len); |
1476 | 1780 | ||
1477 | dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, | 1781 | dout("%s %*pE -> raw_pgid %llu.%x\n", __func__, oid->name_len, |
1478 | pg_out->pool, pg_out->seed); | 1782 | oid->name, raw_pgid->pool, raw_pgid->seed); |
1479 | return 0; | 1783 | return 0; |
1480 | } | 1784 | } |
1481 | EXPORT_SYMBOL(ceph_oloc_oid_to_pg); | 1785 | EXPORT_SYMBOL(ceph_object_locator_to_pg); |
1786 | |||
1787 | /* | ||
1788 | * Map a raw PG (full precision ps) into an actual PG. | ||
1789 | */ | ||
1790 | static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, | ||
1791 | const struct ceph_pg *raw_pgid, | ||
1792 | struct ceph_pg *pgid) | ||
1793 | { | ||
1794 | pgid->pool = raw_pgid->pool; | ||
1795 | pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, | ||
1796 | pi->pg_num_mask); | ||
1797 | } | ||
1798 | |||
1799 | /* | ||
1800 | * Map a raw PG (full precision ps) into a placement ps (placement | ||
1801 | * seed). Include pool id in that value so that different pools don't | ||
1802 | * use the same seeds. | ||
1803 | */ | ||
1804 | static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, | ||
1805 | const struct ceph_pg *raw_pgid) | ||
1806 | { | ||
1807 | if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { | ||
1808 | /* hash pool id and seed so that pool PGs do not overlap */ | ||
1809 | return crush_hash32_2(CRUSH_HASH_RJENKINS1, | ||
1810 | ceph_stable_mod(raw_pgid->seed, | ||
1811 | pi->pgp_num, | ||
1812 | pi->pgp_num_mask), | ||
1813 | raw_pgid->pool); | ||
1814 | } else { | ||
1815 | /* | ||
1816 | * legacy behavior: add ps and pool together. this is | ||
1817 | * not a great approach because the PGs from each pool | ||
1818 | * will overlap on top of each other: 0.5 == 1.4 == | ||
1819 | * 2.3 == ... | ||
1820 | */ | ||
1821 | return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, | ||
1822 | pi->pgp_num_mask) + | ||
1823 | (unsigned)raw_pgid->pool; | ||
1824 | } | ||
1825 | } | ||
1482 | 1826 | ||
1483 | static int do_crush(struct ceph_osdmap *map, int ruleno, int x, | 1827 | static int do_crush(struct ceph_osdmap *map, int ruleno, int x, |
1484 | int *result, int result_max, | 1828 | int *result, int result_max, |
@@ -1497,84 +1841,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, | |||
1497 | } | 1841 | } |
1498 | 1842 | ||
1499 | /* | 1843 | /* |
1500 | * Calculate raw (crush) set for given pgid. | 1844 | * Calculate raw set (CRUSH output) for given PG. The result may |
1845 | * contain nonexistent OSDs. ->primary is undefined for a raw set. | ||
1501 | * | 1846 | * |
1502 | * Return raw set length, or error. | 1847 | * Placement seed (CRUSH input) is returned through @ppps. |
1503 | */ | 1848 | */ |
1504 | static int pg_to_raw_osds(struct ceph_osdmap *osdmap, | 1849 | static void pg_to_raw_osds(struct ceph_osdmap *osdmap, |
1505 | struct ceph_pg_pool_info *pool, | 1850 | struct ceph_pg_pool_info *pi, |
1506 | struct ceph_pg pgid, u32 pps, int *osds) | 1851 | const struct ceph_pg *raw_pgid, |
1852 | struct ceph_osds *raw, | ||
1853 | u32 *ppps) | ||
1507 | { | 1854 | { |
1855 | u32 pps = raw_pg_to_pps(pi, raw_pgid); | ||
1508 | int ruleno; | 1856 | int ruleno; |
1509 | int len; | 1857 | int len; |
1510 | 1858 | ||
1511 | /* crush */ | 1859 | ceph_osds_init(raw); |
1512 | ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, | 1860 | if (ppps) |
1513 | pool->type, pool->size); | 1861 | *ppps = pps; |
1862 | |||
1863 | ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, | ||
1864 | pi->size); | ||
1514 | if (ruleno < 0) { | 1865 | if (ruleno < 0) { |
1515 | pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", | 1866 | pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", |
1516 | pgid.pool, pool->crush_ruleset, pool->type, | 1867 | pi->id, pi->crush_ruleset, pi->type, pi->size); |
1517 | pool->size); | 1868 | return; |
1518 | return -ENOENT; | ||
1519 | } | 1869 | } |
1520 | 1870 | ||
1521 | len = do_crush(osdmap, ruleno, pps, osds, | 1871 | len = do_crush(osdmap, ruleno, pps, raw->osds, |
1522 | min_t(int, pool->size, CEPH_PG_MAX_SIZE), | 1872 | min_t(int, pi->size, ARRAY_SIZE(raw->osds)), |
1523 | osdmap->osd_weight, osdmap->max_osd); | 1873 | osdmap->osd_weight, osdmap->max_osd); |
1524 | if (len < 0) { | 1874 | if (len < 0) { |
1525 | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", | 1875 | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", |
1526 | len, ruleno, pgid.pool, pool->crush_ruleset, | 1876 | len, ruleno, pi->id, pi->crush_ruleset, pi->type, |
1527 | pool->type, pool->size); | 1877 | pi->size); |
1528 | return len; | 1878 | return; |
1529 | } | 1879 | } |
1530 | 1880 | ||
1531 | return len; | 1881 | raw->size = len; |
1532 | } | 1882 | } |
1533 | 1883 | ||
1534 | /* | 1884 | /* |
1535 | * Given raw set, calculate up set and up primary. | 1885 | * Given raw set, calculate up set and up primary. By definition of an |
1886 | * up set, the result won't contain nonexistent or down OSDs. | ||
1536 | * | 1887 | * |
1537 | * Return up set length. *primary is set to up primary osd id, or -1 | 1888 | * This is done in-place - on return @set is the up set. If it's |
1538 | * if up set is empty. | 1889 | * empty, ->primary will remain undefined. |
1539 | */ | 1890 | */ |
1540 | static int raw_to_up_osds(struct ceph_osdmap *osdmap, | 1891 | static void raw_to_up_osds(struct ceph_osdmap *osdmap, |
1541 | struct ceph_pg_pool_info *pool, | 1892 | struct ceph_pg_pool_info *pi, |
1542 | int *osds, int len, int *primary) | 1893 | struct ceph_osds *set) |
1543 | { | 1894 | { |
1544 | int up_primary = -1; | ||
1545 | int i; | 1895 | int i; |
1546 | 1896 | ||
1547 | if (ceph_can_shift_osds(pool)) { | 1897 | /* ->primary is undefined for a raw set */ |
1898 | BUG_ON(set->primary != -1); | ||
1899 | |||
1900 | if (ceph_can_shift_osds(pi)) { | ||
1548 | int removed = 0; | 1901 | int removed = 0; |
1549 | 1902 | ||
1550 | for (i = 0; i < len; i++) { | 1903 | /* shift left */ |
1551 | if (ceph_osd_is_down(osdmap, osds[i])) { | 1904 | for (i = 0; i < set->size; i++) { |
1905 | if (ceph_osd_is_down(osdmap, set->osds[i])) { | ||
1552 | removed++; | 1906 | removed++; |
1553 | continue; | 1907 | continue; |
1554 | } | 1908 | } |
1555 | if (removed) | 1909 | if (removed) |
1556 | osds[i - removed] = osds[i]; | 1910 | set->osds[i - removed] = set->osds[i]; |
1557 | } | 1911 | } |
1558 | 1912 | set->size -= removed; | |
1559 | len -= removed; | 1913 | if (set->size > 0) |
1560 | if (len > 0) | 1914 | set->primary = set->osds[0]; |
1561 | up_primary = osds[0]; | ||
1562 | } else { | 1915 | } else { |
1563 | for (i = len - 1; i >= 0; i--) { | 1916 | /* set down/dne devices to NONE */ |
1564 | if (ceph_osd_is_down(osdmap, osds[i])) | 1917 | for (i = set->size - 1; i >= 0; i--) { |
1565 | osds[i] = CRUSH_ITEM_NONE; | 1918 | if (ceph_osd_is_down(osdmap, set->osds[i])) |
1919 | set->osds[i] = CRUSH_ITEM_NONE; | ||
1566 | else | 1920 | else |
1567 | up_primary = osds[i]; | 1921 | set->primary = set->osds[i]; |
1568 | } | 1922 | } |
1569 | } | 1923 | } |
1570 | |||
1571 | *primary = up_primary; | ||
1572 | return len; | ||
1573 | } | 1924 | } |
1574 | 1925 | ||
1575 | static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | 1926 | static void apply_primary_affinity(struct ceph_osdmap *osdmap, |
1576 | struct ceph_pg_pool_info *pool, | 1927 | struct ceph_pg_pool_info *pi, |
1577 | int *osds, int len, int *primary) | 1928 | u32 pps, |
1929 | struct ceph_osds *up) | ||
1578 | { | 1930 | { |
1579 | int i; | 1931 | int i; |
1580 | int pos = -1; | 1932 | int pos = -1; |
@@ -1586,8 +1938,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1586 | if (!osdmap->osd_primary_affinity) | 1938 | if (!osdmap->osd_primary_affinity) |
1587 | return; | 1939 | return; |
1588 | 1940 | ||
1589 | for (i = 0; i < len; i++) { | 1941 | for (i = 0; i < up->size; i++) { |
1590 | int osd = osds[i]; | 1942 | int osd = up->osds[i]; |
1591 | 1943 | ||
1592 | if (osd != CRUSH_ITEM_NONE && | 1944 | if (osd != CRUSH_ITEM_NONE && |
1593 | osdmap->osd_primary_affinity[osd] != | 1945 | osdmap->osd_primary_affinity[osd] != |
@@ -1595,7 +1947,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1595 | break; | 1947 | break; |
1596 | } | 1948 | } |
1597 | } | 1949 | } |
1598 | if (i == len) | 1950 | if (i == up->size) |
1599 | return; | 1951 | return; |
1600 | 1952 | ||
1601 | /* | 1953 | /* |
@@ -1603,8 +1955,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1603 | * osd into the hash/rng so that a proportional fraction of an | 1955 | * osd into the hash/rng so that a proportional fraction of an |
1604 | * osd's pgs get rejected as primary. | 1956 | * osd's pgs get rejected as primary. |
1605 | */ | 1957 | */ |
1606 | for (i = 0; i < len; i++) { | 1958 | for (i = 0; i < up->size; i++) { |
1607 | int osd = osds[i]; | 1959 | int osd = up->osds[i]; |
1608 | u32 aff; | 1960 | u32 aff; |
1609 | 1961 | ||
1610 | if (osd == CRUSH_ITEM_NONE) | 1962 | if (osd == CRUSH_ITEM_NONE) |
@@ -1629,135 +1981,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1629 | if (pos < 0) | 1981 | if (pos < 0) |
1630 | return; | 1982 | return; |
1631 | 1983 | ||
1632 | *primary = osds[pos]; | 1984 | up->primary = up->osds[pos]; |
1633 | 1985 | ||
1634 | if (ceph_can_shift_osds(pool) && pos > 0) { | 1986 | if (ceph_can_shift_osds(pi) && pos > 0) { |
1635 | /* move the new primary to the front */ | 1987 | /* move the new primary to the front */ |
1636 | for (i = pos; i > 0; i--) | 1988 | for (i = pos; i > 0; i--) |
1637 | osds[i] = osds[i - 1]; | 1989 | up->osds[i] = up->osds[i - 1]; |
1638 | osds[0] = *primary; | 1990 | up->osds[0] = up->primary; |
1639 | } | 1991 | } |
1640 | } | 1992 | } |
1641 | 1993 | ||
1642 | /* | 1994 | /* |
1643 | * Given up set, apply pg_temp and primary_temp mappings. | 1995 | * Get pg_temp and primary_temp mappings for given PG. |
1644 | * | 1996 | * |
1645 | * Return acting set length. *primary is set to acting primary osd id, | 1997 | * Note that a PG may have none, only pg_temp, only primary_temp or |
1646 | * or -1 if acting set is empty. | 1998 | * both pg_temp and primary_temp mappings. This means @temp isn't |
1999 | * always a valid OSD set on return: in the "only primary_temp" case, | ||
2000 | * @temp will have its ->primary >= 0 but ->size == 0. | ||
1647 | */ | 2001 | */ |
1648 | static int apply_temps(struct ceph_osdmap *osdmap, | 2002 | static void get_temp_osds(struct ceph_osdmap *osdmap, |
1649 | struct ceph_pg_pool_info *pool, struct ceph_pg pgid, | 2003 | struct ceph_pg_pool_info *pi, |
1650 | int *osds, int len, int *primary) | 2004 | const struct ceph_pg *raw_pgid, |
2005 | struct ceph_osds *temp) | ||
1651 | { | 2006 | { |
2007 | struct ceph_pg pgid; | ||
1652 | struct ceph_pg_mapping *pg; | 2008 | struct ceph_pg_mapping *pg; |
1653 | int temp_len; | ||
1654 | int temp_primary; | ||
1655 | int i; | 2009 | int i; |
1656 | 2010 | ||
1657 | /* raw_pg -> pg */ | 2011 | raw_pg_to_pg(pi, raw_pgid, &pgid); |
1658 | pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, | 2012 | ceph_osds_init(temp); |
1659 | pool->pg_num_mask); | ||
1660 | 2013 | ||
1661 | /* pg_temp? */ | 2014 | /* pg_temp? */ |
1662 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); | 2015 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); |
1663 | if (pg) { | 2016 | if (pg) { |
1664 | temp_len = 0; | ||
1665 | temp_primary = -1; | ||
1666 | |||
1667 | for (i = 0; i < pg->pg_temp.len; i++) { | 2017 | for (i = 0; i < pg->pg_temp.len; i++) { |
1668 | if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { | 2018 | if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { |
1669 | if (ceph_can_shift_osds(pool)) | 2019 | if (ceph_can_shift_osds(pi)) |
1670 | continue; | 2020 | continue; |
1671 | else | 2021 | |
1672 | osds[temp_len++] = CRUSH_ITEM_NONE; | 2022 | temp->osds[temp->size++] = CRUSH_ITEM_NONE; |
1673 | } else { | 2023 | } else { |
1674 | osds[temp_len++] = pg->pg_temp.osds[i]; | 2024 | temp->osds[temp->size++] = pg->pg_temp.osds[i]; |
1675 | } | 2025 | } |
1676 | } | 2026 | } |
1677 | 2027 | ||
1678 | /* apply pg_temp's primary */ | 2028 | /* apply pg_temp's primary */ |
1679 | for (i = 0; i < temp_len; i++) { | 2029 | for (i = 0; i < temp->size; i++) { |
1680 | if (osds[i] != CRUSH_ITEM_NONE) { | 2030 | if (temp->osds[i] != CRUSH_ITEM_NONE) { |
1681 | temp_primary = osds[i]; | 2031 | temp->primary = temp->osds[i]; |
1682 | break; | 2032 | break; |
1683 | } | 2033 | } |
1684 | } | 2034 | } |
1685 | } else { | ||
1686 | temp_len = len; | ||
1687 | temp_primary = *primary; | ||
1688 | } | 2035 | } |
1689 | 2036 | ||
1690 | /* primary_temp? */ | 2037 | /* primary_temp? */ |
1691 | pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); | 2038 | pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); |
1692 | if (pg) | 2039 | if (pg) |
1693 | temp_primary = pg->primary_temp.osd; | 2040 | temp->primary = pg->primary_temp.osd; |
1694 | |||
1695 | *primary = temp_primary; | ||
1696 | return temp_len; | ||
1697 | } | 2041 | } |
1698 | 2042 | ||
1699 | /* | 2043 | /* |
1700 | * Calculate acting set for given pgid. | 2044 | * Map a PG to its acting set as well as its up set. |
1701 | * | 2045 | * |
1702 | * Return acting set length, or error. *primary is set to acting | 2046 | * Acting set is used for data mapping purposes, while up set can be |
1703 | * primary osd id, or -1 if acting set is empty or on error. | 2047 | * recorded for detecting interval changes and deciding whether to |
2048 | * resend a request. | ||
1704 | */ | 2049 | */ |
1705 | int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | 2050 | void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, |
1706 | int *osds, int *primary) | 2051 | const struct ceph_pg *raw_pgid, |
2052 | struct ceph_osds *up, | ||
2053 | struct ceph_osds *acting) | ||
1707 | { | 2054 | { |
1708 | struct ceph_pg_pool_info *pool; | 2055 | struct ceph_pg_pool_info *pi; |
1709 | u32 pps; | 2056 | u32 pps; |
1710 | int len; | ||
1711 | 2057 | ||
1712 | pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); | 2058 | pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); |
1713 | if (!pool) { | 2059 | if (!pi) { |
1714 | *primary = -1; | 2060 | ceph_osds_init(up); |
1715 | return -ENOENT; | 2061 | ceph_osds_init(acting); |
2062 | goto out; | ||
1716 | } | 2063 | } |
1717 | 2064 | ||
1718 | if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { | 2065 | pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); |
1719 | /* hash pool id and seed so that pool PGs do not overlap */ | 2066 | raw_to_up_osds(osdmap, pi, up); |
1720 | pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, | 2067 | apply_primary_affinity(osdmap, pi, pps, up); |
1721 | ceph_stable_mod(pgid.seed, pool->pgp_num, | 2068 | get_temp_osds(osdmap, pi, raw_pgid, acting); |
1722 | pool->pgp_num_mask), | 2069 | if (!acting->size) { |
1723 | pgid.pool); | 2070 | memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); |
1724 | } else { | 2071 | acting->size = up->size; |
1725 | /* | 2072 | if (acting->primary == -1) |
1726 | * legacy behavior: add ps and pool together. this is | 2073 | acting->primary = up->primary; |
1727 | * not a great approach because the PGs from each pool | ||
1728 | * will overlap on top of each other: 0.5 == 1.4 == | ||
1729 | * 2.3 == ... | ||
1730 | */ | ||
1731 | pps = ceph_stable_mod(pgid.seed, pool->pgp_num, | ||
1732 | pool->pgp_num_mask) + | ||
1733 | (unsigned)pgid.pool; | ||
1734 | } | ||
1735 | |||
1736 | len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); | ||
1737 | if (len < 0) { | ||
1738 | *primary = -1; | ||
1739 | return len; | ||
1740 | } | 2074 | } |
1741 | 2075 | out: | |
1742 | len = raw_to_up_osds(osdmap, pool, osds, len, primary); | 2076 | WARN_ON(!osds_valid(up) || !osds_valid(acting)); |
1743 | |||
1744 | apply_primary_affinity(osdmap, pps, pool, osds, len, primary); | ||
1745 | |||
1746 | len = apply_temps(osdmap, pool, pgid, osds, len, primary); | ||
1747 | |||
1748 | return len; | ||
1749 | } | 2077 | } |
1750 | 2078 | ||
1751 | /* | 2079 | /* |
1752 | * Return primary osd for given pgid, or -1 if none. | 2080 | * Return acting primary for given PG, or -1 if none. |
1753 | */ | 2081 | */ |
1754 | int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) | 2082 | int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, |
2083 | const struct ceph_pg *raw_pgid) | ||
1755 | { | 2084 | { |
1756 | int osds[CEPH_PG_MAX_SIZE]; | 2085 | struct ceph_osds up, acting; |
1757 | int primary; | ||
1758 | |||
1759 | ceph_calc_pg_acting(osdmap, pgid, osds, &primary); | ||
1760 | 2086 | ||
1761 | return primary; | 2087 | ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); |
2088 | return acting.primary; | ||
1762 | } | 2089 | } |
1763 | EXPORT_SYMBOL(ceph_calc_pg_primary); | 2090 | EXPORT_SYMBOL(ceph_pg_to_acting_primary); |