diff options
| -rw-r--r-- | Documentation/filesystems/ceph.txt | 5 | ||||
| -rw-r--r-- | drivers/block/rbd.c | 601 | ||||
| -rw-r--r-- | drivers/block/rbd_types.h | 10 | ||||
| -rw-r--r-- | fs/ceph/addr.c | 19 | ||||
| -rw-r--r-- | fs/ceph/cache.c | 2 | ||||
| -rw-r--r-- | fs/ceph/caps.c | 40 | ||||
| -rw-r--r-- | fs/ceph/debugfs.c | 2 | ||||
| -rw-r--r-- | fs/ceph/dir.c | 32 | ||||
| -rw-r--r-- | fs/ceph/export.c | 3 | ||||
| -rw-r--r-- | fs/ceph/file.c | 106 | ||||
| -rw-r--r-- | fs/ceph/inode.c | 172 | ||||
| -rw-r--r-- | fs/ceph/ioctl.c | 4 | ||||
| -rw-r--r-- | fs/ceph/mds_client.c | 175 | ||||
| -rw-r--r-- | fs/ceph/mds_client.h | 15 | ||||
| -rw-r--r-- | fs/ceph/super.c | 9 | ||||
| -rw-r--r-- | fs/ceph/super.h | 14 | ||||
| -rw-r--r-- | include/linux/ceph/osd_client.h | 6 | ||||
| -rw-r--r-- | include/linux/ceph/osdmap.h | 13 | ||||
| -rw-r--r-- | include/linux/ceph/rados.h | 2 | ||||
| -rw-r--r-- | include/linux/crush/crush.h | 41 | ||||
| -rw-r--r-- | include/linux/crush/mapper.h | 16 | ||||
| -rw-r--r-- | net/ceph/cls_lock_client.c | 14 | ||||
| -rw-r--r-- | net/ceph/crush/crush.c | 5 | ||||
| -rw-r--r-- | net/ceph/crush/mapper.c | 227 | ||||
| -rw-r--r-- | net/ceph/crypto.c | 1 | ||||
| -rw-r--r-- | net/ceph/osd_client.c | 130 | ||||
| -rw-r--r-- | net/ceph/osdmap.c | 101 | ||||
| -rw-r--r-- | net/ceph/snapshot.c | 2 |
28 files changed, 835 insertions, 932 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index f5306ee40ea9..0b302a11718a 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt | |||
| @@ -98,11 +98,10 @@ Mount Options | |||
| 98 | size. | 98 | size. |
| 99 | 99 | ||
| 100 | rsize=X | 100 | rsize=X |
| 101 | Specify the maximum read size in bytes. By default there is no | 101 | Specify the maximum read size in bytes. Default: 64 MB. |
| 102 | maximum. | ||
| 103 | 102 | ||
| 104 | rasize=X | 103 | rasize=X |
| 105 | Specify the maximum readahead. | 104 | Specify the maximum readahead. Default: 8 MB. |
| 106 | 105 | ||
| 107 | mount_timeout=X | 106 | mount_timeout=X |
| 108 | Specify the timeout value for mount (in seconds), in the case | 107 | Specify the timeout value for mount (in seconds), in the case |
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 362cecc77130..4d6807723798 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
| @@ -123,9 +123,11 @@ static int atomic_dec_return_safe(atomic_t *v) | |||
| 123 | #define RBD_FEATURE_LAYERING (1<<0) | 123 | #define RBD_FEATURE_LAYERING (1<<0) |
| 124 | #define RBD_FEATURE_STRIPINGV2 (1<<1) | 124 | #define RBD_FEATURE_STRIPINGV2 (1<<1) |
| 125 | #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) | 125 | #define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) |
| 126 | #define RBD_FEATURE_DATA_POOL (1<<7) | ||
| 126 | #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ | 127 | #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ |
| 127 | RBD_FEATURE_STRIPINGV2 | \ | 128 | RBD_FEATURE_STRIPINGV2 | \ |
| 128 | RBD_FEATURE_EXCLUSIVE_LOCK) | 129 | RBD_FEATURE_EXCLUSIVE_LOCK | \ |
| 130 | RBD_FEATURE_DATA_POOL) | ||
| 129 | 131 | ||
| 130 | /* Features supported by this (client software) implementation. */ | 132 | /* Features supported by this (client software) implementation. */ |
| 131 | 133 | ||
| @@ -144,10 +146,9 @@ struct rbd_image_header { | |||
| 144 | /* These six fields never change for a given rbd image */ | 146 | /* These six fields never change for a given rbd image */ |
| 145 | char *object_prefix; | 147 | char *object_prefix; |
| 146 | __u8 obj_order; | 148 | __u8 obj_order; |
| 147 | __u8 crypt_type; | ||
| 148 | __u8 comp_type; | ||
| 149 | u64 stripe_unit; | 149 | u64 stripe_unit; |
| 150 | u64 stripe_count; | 150 | u64 stripe_count; |
| 151 | s64 data_pool_id; | ||
| 151 | u64 features; /* Might be changeable someday? */ | 152 | u64 features; /* Might be changeable someday? */ |
| 152 | 153 | ||
| 153 | /* The remaining fields need to be updated occasionally */ | 154 | /* The remaining fields need to be updated occasionally */ |
| @@ -230,7 +231,7 @@ enum obj_req_flags { | |||
| 230 | }; | 231 | }; |
| 231 | 232 | ||
| 232 | struct rbd_obj_request { | 233 | struct rbd_obj_request { |
| 233 | const char *object_name; | 234 | u64 object_no; |
| 234 | u64 offset; /* object start byte */ | 235 | u64 offset; /* object start byte */ |
| 235 | u64 length; /* bytes from offset */ | 236 | u64 length; /* bytes from offset */ |
| 236 | unsigned long flags; | 237 | unsigned long flags; |
| @@ -438,7 +439,6 @@ static DEFINE_SPINLOCK(rbd_client_list_lock); | |||
| 438 | 439 | ||
| 439 | static struct kmem_cache *rbd_img_request_cache; | 440 | static struct kmem_cache *rbd_img_request_cache; |
| 440 | static struct kmem_cache *rbd_obj_request_cache; | 441 | static struct kmem_cache *rbd_obj_request_cache; |
| 441 | static struct kmem_cache *rbd_segment_name_cache; | ||
| 442 | 442 | ||
| 443 | static int rbd_major; | 443 | static int rbd_major; |
| 444 | static DEFINE_IDA(rbd_dev_id_ida); | 444 | static DEFINE_IDA(rbd_dev_id_ida); |
| @@ -973,6 +973,30 @@ static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) | |||
| 973 | } | 973 | } |
| 974 | 974 | ||
| 975 | /* | 975 | /* |
| 976 | * returns the size of an object in the image | ||
| 977 | */ | ||
| 978 | static u32 rbd_obj_bytes(struct rbd_image_header *header) | ||
| 979 | { | ||
| 980 | return 1U << header->obj_order; | ||
| 981 | } | ||
| 982 | |||
| 983 | static void rbd_init_layout(struct rbd_device *rbd_dev) | ||
| 984 | { | ||
| 985 | if (rbd_dev->header.stripe_unit == 0 || | ||
| 986 | rbd_dev->header.stripe_count == 0) { | ||
| 987 | rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); | ||
| 988 | rbd_dev->header.stripe_count = 1; | ||
| 989 | } | ||
| 990 | |||
| 991 | rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; | ||
| 992 | rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; | ||
| 993 | rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); | ||
| 994 | rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? | ||
| 995 | rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; | ||
| 996 | RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); | ||
| 997 | } | ||
| 998 | |||
| 999 | /* | ||
| 976 | * Fill an rbd image header with information from the given format 1 | 1000 | * Fill an rbd image header with information from the given format 1 |
| 977 | * on-disk header. | 1001 | * on-disk header. |
| 978 | */ | 1002 | */ |
| @@ -992,15 +1016,11 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, | |||
| 992 | /* Allocate this now to avoid having to handle failure below */ | 1016 | /* Allocate this now to avoid having to handle failure below */ |
| 993 | 1017 | ||
| 994 | if (first_time) { | 1018 | if (first_time) { |
| 995 | size_t len; | 1019 | object_prefix = kstrndup(ondisk->object_prefix, |
| 996 | 1020 | sizeof(ondisk->object_prefix), | |
| 997 | len = strnlen(ondisk->object_prefix, | 1021 | GFP_KERNEL); |
| 998 | sizeof (ondisk->object_prefix)); | ||
| 999 | object_prefix = kmalloc(len + 1, GFP_KERNEL); | ||
| 1000 | if (!object_prefix) | 1022 | if (!object_prefix) |
| 1001 | return -ENOMEM; | 1023 | return -ENOMEM; |
| 1002 | memcpy(object_prefix, ondisk->object_prefix, len); | ||
| 1003 | object_prefix[len] = '\0'; | ||
| 1004 | } | 1024 | } |
| 1005 | 1025 | ||
| 1006 | /* Allocate the snapshot context and fill it in */ | 1026 | /* Allocate the snapshot context and fill it in */ |
| @@ -1051,12 +1071,7 @@ static int rbd_header_from_disk(struct rbd_device *rbd_dev, | |||
| 1051 | if (first_time) { | 1071 | if (first_time) { |
| 1052 | header->object_prefix = object_prefix; | 1072 | header->object_prefix = object_prefix; |
| 1053 | header->obj_order = ondisk->options.order; | 1073 | header->obj_order = ondisk->options.order; |
| 1054 | header->crypt_type = ondisk->options.crypt_type; | 1074 | rbd_init_layout(rbd_dev); |
| 1055 | header->comp_type = ondisk->options.comp_type; | ||
| 1056 | /* The rest aren't used for format 1 images */ | ||
| 1057 | header->stripe_unit = 0; | ||
| 1058 | header->stripe_count = 0; | ||
| 1059 | header->features = 0; | ||
| 1060 | } else { | 1075 | } else { |
| 1061 | ceph_put_snap_context(header->snapc); | 1076 | ceph_put_snap_context(header->snapc); |
| 1062 | kfree(header->snap_names); | 1077 | kfree(header->snap_names); |
| @@ -1232,42 +1247,9 @@ static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) | |||
| 1232 | rbd_dev->mapping.features = 0; | 1247 | rbd_dev->mapping.features = 0; |
| 1233 | } | 1248 | } |
| 1234 | 1249 | ||
| 1235 | static void rbd_segment_name_free(const char *name) | ||
| 1236 | { | ||
| 1237 | /* The explicit cast here is needed to drop the const qualifier */ | ||
| 1238 | |||
| 1239 | kmem_cache_free(rbd_segment_name_cache, (void *)name); | ||
| 1240 | } | ||
| 1241 | |||
| 1242 | static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) | ||
| 1243 | { | ||
| 1244 | char *name; | ||
| 1245 | u64 segment; | ||
| 1246 | int ret; | ||
| 1247 | char *name_format; | ||
| 1248 | |||
| 1249 | name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); | ||
| 1250 | if (!name) | ||
| 1251 | return NULL; | ||
| 1252 | segment = offset >> rbd_dev->header.obj_order; | ||
| 1253 | name_format = "%s.%012llx"; | ||
| 1254 | if (rbd_dev->image_format == 2) | ||
| 1255 | name_format = "%s.%016llx"; | ||
| 1256 | ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, | ||
| 1257 | rbd_dev->header.object_prefix, segment); | ||
| 1258 | if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { | ||
| 1259 | pr_err("error formatting segment name for #%llu (%d)\n", | ||
| 1260 | segment, ret); | ||
| 1261 | rbd_segment_name_free(name); | ||
| 1262 | name = NULL; | ||
| 1263 | } | ||
| 1264 | |||
| 1265 | return name; | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) | 1250 | static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) |
| 1269 | { | 1251 | { |
| 1270 | u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; | 1252 | u64 segment_size = rbd_obj_bytes(&rbd_dev->header); |
| 1271 | 1253 | ||
| 1272 | return offset & (segment_size - 1); | 1254 | return offset & (segment_size - 1); |
| 1273 | } | 1255 | } |
| @@ -1275,7 +1257,7 @@ static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) | |||
| 1275 | static u64 rbd_segment_length(struct rbd_device *rbd_dev, | 1257 | static u64 rbd_segment_length(struct rbd_device *rbd_dev, |
| 1276 | u64 offset, u64 length) | 1258 | u64 offset, u64 length) |
| 1277 | { | 1259 | { |
| 1278 | u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; | 1260 | u64 segment_size = rbd_obj_bytes(&rbd_dev->header); |
| 1279 | 1261 | ||
| 1280 | offset &= segment_size - 1; | 1262 | offset &= segment_size - 1; |
| 1281 | 1263 | ||
| @@ -1287,14 +1269,6 @@ static u64 rbd_segment_length(struct rbd_device *rbd_dev, | |||
| 1287 | } | 1269 | } |
| 1288 | 1270 | ||
| 1289 | /* | 1271 | /* |
| 1290 | * returns the size of an object in the image | ||
| 1291 | */ | ||
| 1292 | static u64 rbd_obj_bytes(struct rbd_image_header *header) | ||
| 1293 | { | ||
| 1294 | return 1 << header->obj_order; | ||
| 1295 | } | ||
| 1296 | |||
| 1297 | /* | ||
| 1298 | * bio helpers | 1272 | * bio helpers |
| 1299 | */ | 1273 | */ |
| 1300 | 1274 | ||
| @@ -1623,7 +1597,9 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) | |||
| 1623 | { | 1597 | { |
| 1624 | struct ceph_osd_request *osd_req = obj_request->osd_req; | 1598 | struct ceph_osd_request *osd_req = obj_request->osd_req; |
| 1625 | 1599 | ||
| 1626 | dout("%s %p osd_req %p\n", __func__, obj_request, osd_req); | 1600 | dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, |
| 1601 | obj_request, obj_request->object_no, obj_request->offset, | ||
| 1602 | obj_request->length, osd_req); | ||
| 1627 | if (obj_request_img_data_test(obj_request)) { | 1603 | if (obj_request_img_data_test(obj_request)) { |
| 1628 | WARN_ON(obj_request->callback != rbd_img_obj_callback); | 1604 | WARN_ON(obj_request->callback != rbd_img_obj_callback); |
| 1629 | rbd_img_request_get(obj_request->img_request); | 1605 | rbd_img_request_get(obj_request->img_request); |
| @@ -1631,44 +1607,6 @@ static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) | |||
| 1631 | ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); | 1607 | ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); |
| 1632 | } | 1608 | } |
| 1633 | 1609 | ||
| 1634 | static void rbd_obj_request_end(struct rbd_obj_request *obj_request) | ||
| 1635 | { | ||
| 1636 | dout("%s %p\n", __func__, obj_request); | ||
| 1637 | ceph_osdc_cancel_request(obj_request->osd_req); | ||
| 1638 | } | ||
| 1639 | |||
| 1640 | /* | ||
| 1641 | * Wait for an object request to complete. If interrupted, cancel the | ||
| 1642 | * underlying osd request. | ||
| 1643 | * | ||
| 1644 | * @timeout: in jiffies, 0 means "wait forever" | ||
| 1645 | */ | ||
| 1646 | static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, | ||
| 1647 | unsigned long timeout) | ||
| 1648 | { | ||
| 1649 | long ret; | ||
| 1650 | |||
| 1651 | dout("%s %p\n", __func__, obj_request); | ||
| 1652 | ret = wait_for_completion_interruptible_timeout( | ||
| 1653 | &obj_request->completion, | ||
| 1654 | ceph_timeout_jiffies(timeout)); | ||
| 1655 | if (ret <= 0) { | ||
| 1656 | if (ret == 0) | ||
| 1657 | ret = -ETIMEDOUT; | ||
| 1658 | rbd_obj_request_end(obj_request); | ||
| 1659 | } else { | ||
| 1660 | ret = 0; | ||
| 1661 | } | ||
| 1662 | |||
| 1663 | dout("%s %p ret %d\n", __func__, obj_request, (int)ret); | ||
| 1664 | return ret; | ||
| 1665 | } | ||
| 1666 | |||
| 1667 | static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) | ||
| 1668 | { | ||
| 1669 | return __rbd_obj_request_wait(obj_request, 0); | ||
| 1670 | } | ||
| 1671 | |||
| 1672 | static void rbd_img_request_complete(struct rbd_img_request *img_request) | 1610 | static void rbd_img_request_complete(struct rbd_img_request *img_request) |
| 1673 | { | 1611 | { |
| 1674 | 1612 | ||
| @@ -1955,8 +1893,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) | |||
| 1955 | rbd_osd_call_callback(obj_request); | 1893 | rbd_osd_call_callback(obj_request); |
| 1956 | break; | 1894 | break; |
| 1957 | default: | 1895 | default: |
| 1958 | rbd_warn(NULL, "%s: unsupported op %hu", | 1896 | rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", |
| 1959 | obj_request->object_name, (unsigned short) opcode); | 1897 | obj_request->object_no, opcode); |
| 1960 | break; | 1898 | break; |
| 1961 | } | 1899 | } |
| 1962 | 1900 | ||
| @@ -1980,6 +1918,40 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) | |||
| 1980 | osd_req->r_data_offset = obj_request->offset; | 1918 | osd_req->r_data_offset = obj_request->offset; |
| 1981 | } | 1919 | } |
| 1982 | 1920 | ||
| 1921 | static struct ceph_osd_request * | ||
| 1922 | __rbd_osd_req_create(struct rbd_device *rbd_dev, | ||
| 1923 | struct ceph_snap_context *snapc, | ||
| 1924 | int num_ops, unsigned int flags, | ||
| 1925 | struct rbd_obj_request *obj_request) | ||
| 1926 | { | ||
| 1927 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; | ||
| 1928 | struct ceph_osd_request *req; | ||
| 1929 | const char *name_format = rbd_dev->image_format == 1 ? | ||
| 1930 | RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; | ||
| 1931 | |||
| 1932 | req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); | ||
| 1933 | if (!req) | ||
| 1934 | return NULL; | ||
| 1935 | |||
| 1936 | req->r_flags = flags; | ||
| 1937 | req->r_callback = rbd_osd_req_callback; | ||
| 1938 | req->r_priv = obj_request; | ||
| 1939 | |||
| 1940 | req->r_base_oloc.pool = rbd_dev->layout.pool_id; | ||
| 1941 | if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, | ||
| 1942 | rbd_dev->header.object_prefix, obj_request->object_no)) | ||
| 1943 | goto err_req; | ||
| 1944 | |||
| 1945 | if (ceph_osdc_alloc_messages(req, GFP_NOIO)) | ||
| 1946 | goto err_req; | ||
| 1947 | |||
| 1948 | return req; | ||
| 1949 | |||
| 1950 | err_req: | ||
| 1951 | ceph_osdc_put_request(req); | ||
| 1952 | return NULL; | ||
| 1953 | } | ||
| 1954 | |||
| 1983 | /* | 1955 | /* |
| 1984 | * Create an osd request. A read request has one osd op (read). | 1956 | * Create an osd request. A read request has one osd op (read). |
| 1985 | * A write request has either one (watch) or two (hint+write) osd ops. | 1957 | * A write request has either one (watch) or two (hint+write) osd ops. |
| @@ -1993,8 +1965,6 @@ static struct ceph_osd_request *rbd_osd_req_create( | |||
| 1993 | struct rbd_obj_request *obj_request) | 1965 | struct rbd_obj_request *obj_request) |
| 1994 | { | 1966 | { |
| 1995 | struct ceph_snap_context *snapc = NULL; | 1967 | struct ceph_snap_context *snapc = NULL; |
| 1996 | struct ceph_osd_client *osdc; | ||
| 1997 | struct ceph_osd_request *osd_req; | ||
| 1998 | 1968 | ||
| 1999 | if (obj_request_img_data_test(obj_request) && | 1969 | if (obj_request_img_data_test(obj_request) && |
| 2000 | (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { | 1970 | (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { |
| @@ -2009,35 +1979,9 @@ static struct ceph_osd_request *rbd_osd_req_create( | |||
| 2009 | 1979 | ||
| 2010 | rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); | 1980 | rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); |
| 2011 | 1981 | ||
| 2012 | /* Allocate and initialize the request, for the num_ops ops */ | 1982 | return __rbd_osd_req_create(rbd_dev, snapc, num_ops, |
| 2013 | 1983 | (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? | |
| 2014 | osdc = &rbd_dev->rbd_client->client->osdc; | 1984 | CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); |
| 2015 | osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, | ||
| 2016 | GFP_NOIO); | ||
| 2017 | if (!osd_req) | ||
| 2018 | goto fail; | ||
| 2019 | |||
| 2020 | if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) | ||
| 2021 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | ||
| 2022 | else | ||
| 2023 | osd_req->r_flags = CEPH_OSD_FLAG_READ; | ||
| 2024 | |||
| 2025 | osd_req->r_callback = rbd_osd_req_callback; | ||
| 2026 | osd_req->r_priv = obj_request; | ||
| 2027 | |||
| 2028 | osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; | ||
| 2029 | if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", | ||
| 2030 | obj_request->object_name)) | ||
| 2031 | goto fail; | ||
| 2032 | |||
| 2033 | if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) | ||
| 2034 | goto fail; | ||
| 2035 | |||
| 2036 | return osd_req; | ||
| 2037 | |||
| 2038 | fail: | ||
| 2039 | ceph_osdc_put_request(osd_req); | ||
| 2040 | return NULL; | ||
| 2041 | } | 1985 | } |
| 2042 | 1986 | ||
| 2043 | /* | 1987 | /* |
| @@ -2050,10 +1994,6 @@ static struct ceph_osd_request * | |||
| 2050 | rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) | 1994 | rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) |
| 2051 | { | 1995 | { |
| 2052 | struct rbd_img_request *img_request; | 1996 | struct rbd_img_request *img_request; |
| 2053 | struct ceph_snap_context *snapc; | ||
| 2054 | struct rbd_device *rbd_dev; | ||
| 2055 | struct ceph_osd_client *osdc; | ||
| 2056 | struct ceph_osd_request *osd_req; | ||
| 2057 | int num_osd_ops = 3; | 1997 | int num_osd_ops = 3; |
| 2058 | 1998 | ||
| 2059 | rbd_assert(obj_request_img_data_test(obj_request)); | 1999 | rbd_assert(obj_request_img_data_test(obj_request)); |
| @@ -2065,77 +2005,34 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) | |||
| 2065 | if (img_request_discard_test(img_request)) | 2005 | if (img_request_discard_test(img_request)) |
| 2066 | num_osd_ops = 2; | 2006 | num_osd_ops = 2; |
| 2067 | 2007 | ||
| 2068 | /* Allocate and initialize the request, for all the ops */ | 2008 | return __rbd_osd_req_create(img_request->rbd_dev, |
| 2069 | 2009 | img_request->snapc, num_osd_ops, | |
| 2070 | snapc = img_request->snapc; | 2010 | CEPH_OSD_FLAG_WRITE, obj_request); |
| 2071 | rbd_dev = img_request->rbd_dev; | ||
| 2072 | osdc = &rbd_dev->rbd_client->client->osdc; | ||
| 2073 | osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, | ||
| 2074 | false, GFP_NOIO); | ||
| 2075 | if (!osd_req) | ||
| 2076 | goto fail; | ||
| 2077 | |||
| 2078 | osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | ||
| 2079 | osd_req->r_callback = rbd_osd_req_callback; | ||
| 2080 | osd_req->r_priv = obj_request; | ||
| 2081 | |||
| 2082 | osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id; | ||
| 2083 | if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s", | ||
| 2084 | obj_request->object_name)) | ||
| 2085 | goto fail; | ||
| 2086 | |||
| 2087 | if (ceph_osdc_alloc_messages(osd_req, GFP_NOIO)) | ||
| 2088 | goto fail; | ||
| 2089 | |||
| 2090 | return osd_req; | ||
| 2091 | |||
| 2092 | fail: | ||
| 2093 | ceph_osdc_put_request(osd_req); | ||
| 2094 | return NULL; | ||
| 2095 | } | 2011 | } |
| 2096 | 2012 | ||
| 2097 | |||
| 2098 | static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) | 2013 | static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) |
| 2099 | { | 2014 | { |
| 2100 | ceph_osdc_put_request(osd_req); | 2015 | ceph_osdc_put_request(osd_req); |
| 2101 | } | 2016 | } |
| 2102 | 2017 | ||
| 2103 | /* object_name is assumed to be a non-null pointer and NUL-terminated */ | 2018 | static struct rbd_obj_request * |
| 2104 | 2019 | rbd_obj_request_create(enum obj_request_type type) | |
| 2105 | static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, | ||
| 2106 | u64 offset, u64 length, | ||
| 2107 | enum obj_request_type type) | ||
| 2108 | { | 2020 | { |
| 2109 | struct rbd_obj_request *obj_request; | 2021 | struct rbd_obj_request *obj_request; |
| 2110 | size_t size; | ||
| 2111 | char *name; | ||
| 2112 | 2022 | ||
| 2113 | rbd_assert(obj_request_type_valid(type)); | 2023 | rbd_assert(obj_request_type_valid(type)); |
| 2114 | 2024 | ||
| 2115 | size = strlen(object_name) + 1; | ||
| 2116 | name = kmalloc(size, GFP_NOIO); | ||
| 2117 | if (!name) | ||
| 2118 | return NULL; | ||
| 2119 | |||
| 2120 | obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); | 2025 | obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); |
| 2121 | if (!obj_request) { | 2026 | if (!obj_request) |
| 2122 | kfree(name); | ||
| 2123 | return NULL; | 2027 | return NULL; |
| 2124 | } | ||
| 2125 | 2028 | ||
| 2126 | obj_request->object_name = memcpy(name, object_name, size); | ||
| 2127 | obj_request->offset = offset; | ||
| 2128 | obj_request->length = length; | ||
| 2129 | obj_request->flags = 0; | ||
| 2130 | obj_request->which = BAD_WHICH; | 2029 | obj_request->which = BAD_WHICH; |
| 2131 | obj_request->type = type; | 2030 | obj_request->type = type; |
| 2132 | INIT_LIST_HEAD(&obj_request->links); | 2031 | INIT_LIST_HEAD(&obj_request->links); |
| 2133 | init_completion(&obj_request->completion); | 2032 | init_completion(&obj_request->completion); |
| 2134 | kref_init(&obj_request->kref); | 2033 | kref_init(&obj_request->kref); |
| 2135 | 2034 | ||
| 2136 | dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, | 2035 | dout("%s %p\n", __func__, obj_request); |
| 2137 | offset, length, (int)type, obj_request); | ||
| 2138 | |||
| 2139 | return obj_request; | 2036 | return obj_request; |
| 2140 | } | 2037 | } |
| 2141 | 2038 | ||
| @@ -2170,8 +2067,6 @@ static void rbd_obj_request_destroy(struct kref *kref) | |||
| 2170 | break; | 2067 | break; |
| 2171 | } | 2068 | } |
| 2172 | 2069 | ||
| 2173 | kfree(obj_request->object_name); | ||
| 2174 | obj_request->object_name = NULL; | ||
| 2175 | kmem_cache_free(rbd_obj_request_cache, obj_request); | 2070 | kmem_cache_free(rbd_obj_request_cache, obj_request); |
| 2176 | } | 2071 | } |
| 2177 | 2072 | ||
| @@ -2546,22 +2441,18 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, | |||
| 2546 | 2441 | ||
| 2547 | while (resid) { | 2442 | while (resid) { |
| 2548 | struct ceph_osd_request *osd_req; | 2443 | struct ceph_osd_request *osd_req; |
| 2549 | const char *object_name; | 2444 | u64 object_no = img_offset >> rbd_dev->header.obj_order; |
| 2550 | u64 offset; | 2445 | u64 offset = rbd_segment_offset(rbd_dev, img_offset); |
| 2551 | u64 length; | 2446 | u64 length = rbd_segment_length(rbd_dev, img_offset, resid); |
| 2552 | 2447 | ||
| 2553 | object_name = rbd_segment_name(rbd_dev, img_offset); | 2448 | obj_request = rbd_obj_request_create(type); |
| 2554 | if (!object_name) | ||
| 2555 | goto out_unwind; | ||
| 2556 | offset = rbd_segment_offset(rbd_dev, img_offset); | ||
| 2557 | length = rbd_segment_length(rbd_dev, img_offset, resid); | ||
| 2558 | obj_request = rbd_obj_request_create(object_name, | ||
| 2559 | offset, length, type); | ||
| 2560 | /* object request has its own copy of the object name */ | ||
| 2561 | rbd_segment_name_free(object_name); | ||
| 2562 | if (!obj_request) | 2449 | if (!obj_request) |
| 2563 | goto out_unwind; | 2450 | goto out_unwind; |
| 2564 | 2451 | ||
| 2452 | obj_request->object_no = object_no; | ||
| 2453 | obj_request->offset = offset; | ||
| 2454 | obj_request->length = length; | ||
| 2455 | |||
| 2565 | /* | 2456 | /* |
| 2566 | * set obj_request->img_request before creating the | 2457 | * set obj_request->img_request before creating the |
| 2567 | * osd_request so that it gets the right snapc | 2458 | * osd_request so that it gets the right snapc |
| @@ -2771,7 +2662,7 @@ static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) | |||
| 2771 | * child image to which the original request was to be sent. | 2662 | * child image to which the original request was to be sent. |
| 2772 | */ | 2663 | */ |
| 2773 | img_offset = obj_request->img_offset - obj_request->offset; | 2664 | img_offset = obj_request->img_offset - obj_request->offset; |
| 2774 | length = (u64)1 << rbd_dev->header.obj_order; | 2665 | length = rbd_obj_bytes(&rbd_dev->header); |
| 2775 | 2666 | ||
| 2776 | /* | 2667 | /* |
| 2777 | * There is no defined parent data beyond the parent | 2668 | * There is no defined parent data beyond the parent |
| @@ -2900,11 +2791,12 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) | |||
| 2900 | size_t size; | 2791 | size_t size; |
| 2901 | int ret; | 2792 | int ret; |
| 2902 | 2793 | ||
| 2903 | stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, | 2794 | stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); |
| 2904 | OBJ_REQUEST_PAGES); | ||
| 2905 | if (!stat_request) | 2795 | if (!stat_request) |
| 2906 | return -ENOMEM; | 2796 | return -ENOMEM; |
| 2907 | 2797 | ||
| 2798 | stat_request->object_no = obj_request->object_no; | ||
| 2799 | |||
| 2908 | stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, | 2800 | stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, |
| 2909 | stat_request); | 2801 | stat_request); |
| 2910 | if (!stat_request->osd_req) { | 2802 | if (!stat_request->osd_req) { |
| @@ -3983,17 +3875,17 @@ out: | |||
| 3983 | * returned in the outbound buffer, or a negative error code. | 3875 | * returned in the outbound buffer, or a negative error code. |
| 3984 | */ | 3876 | */ |
| 3985 | static int rbd_obj_method_sync(struct rbd_device *rbd_dev, | 3877 | static int rbd_obj_method_sync(struct rbd_device *rbd_dev, |
| 3986 | const char *object_name, | 3878 | struct ceph_object_id *oid, |
| 3987 | const char *class_name, | 3879 | struct ceph_object_locator *oloc, |
| 3988 | const char *method_name, | 3880 | const char *method_name, |
| 3989 | const void *outbound, | 3881 | const void *outbound, |
| 3990 | size_t outbound_size, | 3882 | size_t outbound_size, |
| 3991 | void *inbound, | 3883 | void *inbound, |
| 3992 | size_t inbound_size) | 3884 | size_t inbound_size) |
| 3993 | { | 3885 | { |
| 3994 | struct rbd_obj_request *obj_request; | 3886 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
| 3995 | struct page **pages; | 3887 | struct page *req_page = NULL; |
| 3996 | u32 page_count; | 3888 | struct page *reply_page; |
| 3997 | int ret; | 3889 | int ret; |
| 3998 | 3890 | ||
| 3999 | /* | 3891 | /* |
| @@ -4003,61 +3895,35 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, | |||
| 4003 | * method. Currently if this is present it will be a | 3895 | * method. Currently if this is present it will be a |
| 4004 | * snapshot id. | 3896 | * snapshot id. |
| 4005 | */ | 3897 | */ |
| 4006 | page_count = (u32)calc_pages_for(0, inbound_size); | 3898 | if (outbound) { |
| 4007 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | 3899 | if (outbound_size > PAGE_SIZE) |
| 4008 | if (IS_ERR(pages)) | 3900 | return -E2BIG; |
| 4009 | return PTR_ERR(pages); | ||
| 4010 | |||
| 4011 | ret = -ENOMEM; | ||
| 4012 | obj_request = rbd_obj_request_create(object_name, 0, inbound_size, | ||
| 4013 | OBJ_REQUEST_PAGES); | ||
| 4014 | if (!obj_request) | ||
| 4015 | goto out; | ||
| 4016 | 3901 | ||
| 4017 | obj_request->pages = pages; | 3902 | req_page = alloc_page(GFP_KERNEL); |
| 4018 | obj_request->page_count = page_count; | 3903 | if (!req_page) |
| 4019 | 3904 | return -ENOMEM; | |
| 4020 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, | ||
| 4021 | obj_request); | ||
| 4022 | if (!obj_request->osd_req) | ||
| 4023 | goto out; | ||
| 4024 | |||
| 4025 | osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, | ||
| 4026 | class_name, method_name); | ||
| 4027 | if (outbound_size) { | ||
| 4028 | struct ceph_pagelist *pagelist; | ||
| 4029 | |||
| 4030 | pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); | ||
| 4031 | if (!pagelist) | ||
| 4032 | goto out; | ||
| 4033 | 3905 | ||
| 4034 | ceph_pagelist_init(pagelist); | 3906 | memcpy(page_address(req_page), outbound, outbound_size); |
| 4035 | ceph_pagelist_append(pagelist, outbound, outbound_size); | ||
| 4036 | osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, | ||
| 4037 | pagelist); | ||
| 4038 | } | 3907 | } |
| 4039 | osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, | ||
| 4040 | obj_request->pages, inbound_size, | ||
| 4041 | 0, false, false); | ||
| 4042 | |||
| 4043 | rbd_obj_request_submit(obj_request); | ||
| 4044 | ret = rbd_obj_request_wait(obj_request); | ||
| 4045 | if (ret) | ||
| 4046 | goto out; | ||
| 4047 | 3908 | ||
| 4048 | ret = obj_request->result; | 3909 | reply_page = alloc_page(GFP_KERNEL); |
| 4049 | if (ret < 0) | 3910 | if (!reply_page) { |
| 4050 | goto out; | 3911 | if (req_page) |
| 3912 | __free_page(req_page); | ||
| 3913 | return -ENOMEM; | ||
| 3914 | } | ||
| 4051 | 3915 | ||
| 4052 | rbd_assert(obj_request->xferred < (u64)INT_MAX); | 3916 | ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, |
| 4053 | ret = (int)obj_request->xferred; | 3917 | CEPH_OSD_FLAG_READ, req_page, outbound_size, |
| 4054 | ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); | 3918 | reply_page, &inbound_size); |
| 4055 | out: | 3919 | if (!ret) { |
| 4056 | if (obj_request) | 3920 | memcpy(inbound, page_address(reply_page), inbound_size); |
| 4057 | rbd_obj_request_put(obj_request); | 3921 | ret = inbound_size; |
| 4058 | else | 3922 | } |
| 4059 | ceph_release_page_vector(pages, page_count); | ||
| 4060 | 3923 | ||
| 3924 | if (req_page) | ||
| 3925 | __free_page(req_page); | ||
| 3926 | __free_page(reply_page); | ||
| 4061 | return ret; | 3927 | return ret; |
| 4062 | } | 3928 | } |
| 4063 | 3929 | ||
| @@ -4256,63 +4122,46 @@ static void rbd_free_disk(struct rbd_device *rbd_dev) | |||
| 4256 | } | 4122 | } |
| 4257 | 4123 | ||
| 4258 | static int rbd_obj_read_sync(struct rbd_device *rbd_dev, | 4124 | static int rbd_obj_read_sync(struct rbd_device *rbd_dev, |
| 4259 | const char *object_name, | 4125 | struct ceph_object_id *oid, |
| 4260 | u64 offset, u64 length, void *buf) | 4126 | struct ceph_object_locator *oloc, |
| 4127 | void *buf, int buf_len) | ||
| 4261 | 4128 | ||
| 4262 | { | 4129 | { |
| 4263 | struct rbd_obj_request *obj_request; | 4130 | struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; |
| 4264 | struct page **pages = NULL; | 4131 | struct ceph_osd_request *req; |
| 4265 | u32 page_count; | 4132 | struct page **pages; |
| 4266 | size_t size; | 4133 | int num_pages = calc_pages_for(0, buf_len); |
| 4267 | int ret; | 4134 | int ret; |
| 4268 | 4135 | ||
| 4269 | page_count = (u32) calc_pages_for(offset, length); | 4136 | req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); |
| 4270 | pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); | 4137 | if (!req) |
| 4271 | if (IS_ERR(pages)) | 4138 | return -ENOMEM; |
| 4272 | return PTR_ERR(pages); | ||
| 4273 | |||
| 4274 | ret = -ENOMEM; | ||
| 4275 | obj_request = rbd_obj_request_create(object_name, offset, length, | ||
| 4276 | OBJ_REQUEST_PAGES); | ||
| 4277 | if (!obj_request) | ||
| 4278 | goto out; | ||
| 4279 | |||
| 4280 | obj_request->pages = pages; | ||
| 4281 | obj_request->page_count = page_count; | ||
| 4282 | |||
| 4283 | obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, | ||
| 4284 | obj_request); | ||
| 4285 | if (!obj_request->osd_req) | ||
| 4286 | goto out; | ||
| 4287 | 4139 | ||
| 4288 | osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, | 4140 | ceph_oid_copy(&req->r_base_oid, oid); |
| 4289 | offset, length, 0, 0); | 4141 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
| 4290 | osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, | 4142 | req->r_flags = CEPH_OSD_FLAG_READ; |
| 4291 | obj_request->pages, | ||
| 4292 | obj_request->length, | ||
| 4293 | obj_request->offset & ~PAGE_MASK, | ||
| 4294 | false, false); | ||
| 4295 | 4143 | ||
| 4296 | rbd_obj_request_submit(obj_request); | 4144 | ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); |
| 4297 | ret = rbd_obj_request_wait(obj_request); | ||
| 4298 | if (ret) | 4145 | if (ret) |
| 4299 | goto out; | 4146 | goto out_req; |
| 4300 | 4147 | ||
| 4301 | ret = obj_request->result; | 4148 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
| 4302 | if (ret < 0) | 4149 | if (IS_ERR(pages)) { |
| 4303 | goto out; | 4150 | ret = PTR_ERR(pages); |
| 4151 | goto out_req; | ||
| 4152 | } | ||
| 4304 | 4153 | ||
| 4305 | rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); | 4154 | osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); |
| 4306 | size = (size_t) obj_request->xferred; | 4155 | osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, |
| 4307 | ceph_copy_from_page_vector(pages, buf, 0, size); | 4156 | true); |
| 4308 | rbd_assert(size <= (size_t)INT_MAX); | 4157 | |
| 4309 | ret = (int)size; | 4158 | ceph_osdc_start_request(osdc, req, false); |
| 4310 | out: | 4159 | ret = ceph_osdc_wait_request(osdc, req); |
| 4311 | if (obj_request) | 4160 | if (ret >= 0) |
| 4312 | rbd_obj_request_put(obj_request); | 4161 | ceph_copy_from_page_vector(pages, buf, 0, ret); |
| 4313 | else | ||
| 4314 | ceph_release_page_vector(pages, page_count); | ||
| 4315 | 4162 | ||
| 4163 | out_req: | ||
| 4164 | ceph_osdc_put_request(req); | ||
| 4316 | return ret; | 4165 | return ret; |
| 4317 | } | 4166 | } |
| 4318 | 4167 | ||
| @@ -4348,8 +4197,8 @@ static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) | |||
| 4348 | if (!ondisk) | 4197 | if (!ondisk) |
| 4349 | return -ENOMEM; | 4198 | return -ENOMEM; |
| 4350 | 4199 | ||
| 4351 | ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_oid.name, | 4200 | ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, |
| 4352 | 0, size, ondisk); | 4201 | &rbd_dev->header_oloc, ondisk, size); |
| 4353 | if (ret < 0) | 4202 | if (ret < 0) |
| 4354 | goto out; | 4203 | goto out; |
| 4355 | if ((size_t)ret < size) { | 4204 | if ((size_t)ret < size) { |
| @@ -4781,7 +4630,7 @@ static const struct attribute_group *rbd_attr_groups[] = { | |||
| 4781 | 4630 | ||
| 4782 | static void rbd_dev_release(struct device *dev); | 4631 | static void rbd_dev_release(struct device *dev); |
| 4783 | 4632 | ||
| 4784 | static struct device_type rbd_device_type = { | 4633 | static const struct device_type rbd_device_type = { |
| 4785 | .name = "rbd", | 4634 | .name = "rbd", |
| 4786 | .groups = rbd_attr_groups, | 4635 | .groups = rbd_attr_groups, |
| 4787 | .release = rbd_dev_release, | 4636 | .release = rbd_dev_release, |
| @@ -4876,8 +4725,9 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, | |||
| 4876 | INIT_LIST_HEAD(&rbd_dev->node); | 4725 | INIT_LIST_HEAD(&rbd_dev->node); |
| 4877 | init_rwsem(&rbd_dev->header_rwsem); | 4726 | init_rwsem(&rbd_dev->header_rwsem); |
| 4878 | 4727 | ||
| 4728 | rbd_dev->header.data_pool_id = CEPH_NOPOOL; | ||
| 4879 | ceph_oid_init(&rbd_dev->header_oid); | 4729 | ceph_oid_init(&rbd_dev->header_oid); |
| 4880 | ceph_oloc_init(&rbd_dev->header_oloc); | 4730 | rbd_dev->header_oloc.pool = spec->pool_id; |
| 4881 | 4731 | ||
| 4882 | mutex_init(&rbd_dev->watch_mutex); | 4732 | mutex_init(&rbd_dev->watch_mutex); |
| 4883 | rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; | 4733 | rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; |
| @@ -4899,12 +4749,6 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, | |||
| 4899 | rbd_dev->rbd_client = rbdc; | 4749 | rbd_dev->rbd_client = rbdc; |
| 4900 | rbd_dev->spec = spec; | 4750 | rbd_dev->spec = spec; |
| 4901 | 4751 | ||
| 4902 | rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER; | ||
| 4903 | rbd_dev->layout.stripe_count = 1; | ||
| 4904 | rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER; | ||
| 4905 | rbd_dev->layout.pool_id = spec->pool_id; | ||
| 4906 | RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); | ||
| 4907 | |||
| 4908 | return rbd_dev; | 4752 | return rbd_dev; |
| 4909 | } | 4753 | } |
| 4910 | 4754 | ||
| @@ -4970,10 +4814,10 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, | |||
| 4970 | __le64 size; | 4814 | __le64 size; |
| 4971 | } __attribute__ ((packed)) size_buf = { 0 }; | 4815 | } __attribute__ ((packed)) size_buf = { 0 }; |
| 4972 | 4816 | ||
| 4973 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 4817 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 4974 | "rbd", "get_size", | 4818 | &rbd_dev->header_oloc, "get_size", |
| 4975 | &snapid, sizeof (snapid), | 4819 | &snapid, sizeof(snapid), |
| 4976 | &size_buf, sizeof (size_buf)); | 4820 | &size_buf, sizeof(size_buf)); |
| 4977 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4821 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 4978 | if (ret < 0) | 4822 | if (ret < 0) |
| 4979 | return ret; | 4823 | return ret; |
| @@ -5010,9 +4854,9 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) | |||
| 5010 | if (!reply_buf) | 4854 | if (!reply_buf) |
| 5011 | return -ENOMEM; | 4855 | return -ENOMEM; |
| 5012 | 4856 | ||
| 5013 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 4857 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 5014 | "rbd", "get_object_prefix", NULL, 0, | 4858 | &rbd_dev->header_oloc, "get_object_prefix", |
| 5015 | reply_buf, RBD_OBJ_PREFIX_LEN_MAX); | 4859 | NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); |
| 5016 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4860 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5017 | if (ret < 0) | 4861 | if (ret < 0) |
| 5018 | goto out; | 4862 | goto out; |
| @@ -5045,10 +4889,10 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, | |||
| 5045 | u64 unsup; | 4889 | u64 unsup; |
| 5046 | int ret; | 4890 | int ret; |
| 5047 | 4891 | ||
| 5048 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 4892 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 5049 | "rbd", "get_features", | 4893 | &rbd_dev->header_oloc, "get_features", |
| 5050 | &snapid, sizeof (snapid), | 4894 | &snapid, sizeof(snapid), |
| 5051 | &features_buf, sizeof (features_buf)); | 4895 | &features_buf, sizeof(features_buf)); |
| 5052 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4896 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5053 | if (ret < 0) | 4897 | if (ret < 0) |
| 5054 | return ret; | 4898 | return ret; |
| @@ -5107,10 +4951,9 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) | |||
| 5107 | } | 4951 | } |
| 5108 | 4952 | ||
| 5109 | snapid = cpu_to_le64(rbd_dev->spec->snap_id); | 4953 | snapid = cpu_to_le64(rbd_dev->spec->snap_id); |
| 5110 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 4954 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 5111 | "rbd", "get_parent", | 4955 | &rbd_dev->header_oloc, "get_parent", |
| 5112 | &snapid, sizeof (snapid), | 4956 | &snapid, sizeof(snapid), reply_buf, size); |
| 5113 | reply_buf, size); | ||
| 5114 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 4957 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5115 | if (ret < 0) | 4958 | if (ret < 0) |
| 5116 | goto out_err; | 4959 | goto out_err; |
| @@ -5210,9 +5053,9 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) | |||
| 5210 | u64 stripe_count; | 5053 | u64 stripe_count; |
| 5211 | int ret; | 5054 | int ret; |
| 5212 | 5055 | ||
| 5213 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 5056 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 5214 | "rbd", "get_stripe_unit_count", NULL, 0, | 5057 | &rbd_dev->header_oloc, "get_stripe_unit_count", |
| 5215 | (char *)&striping_info_buf, size); | 5058 | NULL, 0, &striping_info_buf, size); |
| 5216 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 5059 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5217 | if (ret < 0) | 5060 | if (ret < 0) |
| 5218 | return ret; | 5061 | return ret; |
| @@ -5226,7 +5069,7 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) | |||
| 5226 | * out, and only fail if the image has non-default values. | 5069 | * out, and only fail if the image has non-default values. |
| 5227 | */ | 5070 | */ |
| 5228 | ret = -EINVAL; | 5071 | ret = -EINVAL; |
| 5229 | obj_size = (u64)1 << rbd_dev->header.obj_order; | 5072 | obj_size = rbd_obj_bytes(&rbd_dev->header); |
| 5230 | p = &striping_info_buf; | 5073 | p = &striping_info_buf; |
| 5231 | stripe_unit = ceph_decode_64(&p); | 5074 | stripe_unit = ceph_decode_64(&p); |
| 5232 | if (stripe_unit != obj_size) { | 5075 | if (stripe_unit != obj_size) { |
| @@ -5247,8 +5090,27 @@ static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) | |||
| 5247 | return 0; | 5090 | return 0; |
| 5248 | } | 5091 | } |
| 5249 | 5092 | ||
| 5093 | static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) | ||
| 5094 | { | ||
| 5095 | __le64 data_pool_id; | ||
| 5096 | int ret; | ||
| 5097 | |||
| 5098 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, | ||
| 5099 | &rbd_dev->header_oloc, "get_data_pool", | ||
| 5100 | NULL, 0, &data_pool_id, sizeof(data_pool_id)); | ||
| 5101 | if (ret < 0) | ||
| 5102 | return ret; | ||
| 5103 | if (ret < sizeof(data_pool_id)) | ||
| 5104 | return -EBADMSG; | ||
| 5105 | |||
| 5106 | rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); | ||
| 5107 | WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); | ||
| 5108 | return 0; | ||
| 5109 | } | ||
| 5110 | |||
| 5250 | static char *rbd_dev_image_name(struct rbd_device *rbd_dev) | 5111 | static char *rbd_dev_image_name(struct rbd_device *rbd_dev) |
| 5251 | { | 5112 | { |
| 5113 | CEPH_DEFINE_OID_ONSTACK(oid); | ||
| 5252 | size_t image_id_size; | 5114 | size_t image_id_size; |
| 5253 | char *image_id; | 5115 | char *image_id; |
| 5254 | void *p; | 5116 | void *p; |
| @@ -5276,10 +5138,10 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) | |||
| 5276 | if (!reply_buf) | 5138 | if (!reply_buf) |
| 5277 | goto out; | 5139 | goto out; |
| 5278 | 5140 | ||
| 5279 | ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, | 5141 | ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); |
| 5280 | "rbd", "dir_get_name", | 5142 | ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, |
| 5281 | image_id, image_id_size, | 5143 | "dir_get_name", image_id, image_id_size, |
| 5282 | reply_buf, size); | 5144 | reply_buf, size); |
| 5283 | if (ret < 0) | 5145 | if (ret < 0) |
| 5284 | goto out; | 5146 | goto out; |
| 5285 | p = reply_buf; | 5147 | p = reply_buf; |
| @@ -5458,9 +5320,9 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) | |||
| 5458 | if (!reply_buf) | 5320 | if (!reply_buf) |
| 5459 | return -ENOMEM; | 5321 | return -ENOMEM; |
| 5460 | 5322 | ||
| 5461 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 5323 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 5462 | "rbd", "get_snapcontext", NULL, 0, | 5324 | &rbd_dev->header_oloc, "get_snapcontext", |
| 5463 | reply_buf, size); | 5325 | NULL, 0, reply_buf, size); |
| 5464 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 5326 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5465 | if (ret < 0) | 5327 | if (ret < 0) |
| 5466 | goto out; | 5328 | goto out; |
| @@ -5523,10 +5385,9 @@ static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, | |||
| 5523 | return ERR_PTR(-ENOMEM); | 5385 | return ERR_PTR(-ENOMEM); |
| 5524 | 5386 | ||
| 5525 | snapid = cpu_to_le64(snap_id); | 5387 | snapid = cpu_to_le64(snap_id); |
| 5526 | ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_oid.name, | 5388 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
| 5527 | "rbd", "get_snapshot_name", | 5389 | &rbd_dev->header_oloc, "get_snapshot_name", |
| 5528 | &snapid, sizeof (snapid), | 5390 | &snapid, sizeof(snapid), reply_buf, size); |
| 5529 | reply_buf, size); | ||
| 5530 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 5391 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5531 | if (ret < 0) { | 5392 | if (ret < 0) { |
| 5532 | snap_name = ERR_PTR(ret); | 5393 | snap_name = ERR_PTR(ret); |
| @@ -5833,7 +5694,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
| 5833 | { | 5694 | { |
| 5834 | int ret; | 5695 | int ret; |
| 5835 | size_t size; | 5696 | size_t size; |
| 5836 | char *object_name; | 5697 | CEPH_DEFINE_OID_ONSTACK(oid); |
| 5837 | void *response; | 5698 | void *response; |
| 5838 | char *image_id; | 5699 | char *image_id; |
| 5839 | 5700 | ||
| @@ -5853,12 +5714,12 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
| 5853 | * First, see if the format 2 image id file exists, and if | 5714 | * First, see if the format 2 image id file exists, and if |
| 5854 | * so, get the image's persistent id from it. | 5715 | * so, get the image's persistent id from it. |
| 5855 | */ | 5716 | */ |
| 5856 | size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); | 5717 | ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, |
| 5857 | object_name = kmalloc(size, GFP_NOIO); | 5718 | rbd_dev->spec->image_name); |
| 5858 | if (!object_name) | 5719 | if (ret) |
| 5859 | return -ENOMEM; | 5720 | return ret; |
| 5860 | sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); | 5721 | |
| 5861 | dout("rbd id object name is %s\n", object_name); | 5722 | dout("rbd id object name is %s\n", oid.name); |
| 5862 | 5723 | ||
| 5863 | /* Response will be an encoded string, which includes a length */ | 5724 | /* Response will be an encoded string, which includes a length */ |
| 5864 | 5725 | ||
| @@ -5871,9 +5732,9 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
| 5871 | 5732 | ||
| 5872 | /* If it doesn't exist we'll assume it's a format 1 image */ | 5733 | /* If it doesn't exist we'll assume it's a format 1 image */ |
| 5873 | 5734 | ||
| 5874 | ret = rbd_obj_method_sync(rbd_dev, object_name, | 5735 | ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, |
| 5875 | "rbd", "get_id", NULL, 0, | 5736 | "get_id", NULL, 0, |
| 5876 | response, RBD_IMAGE_ID_LEN_MAX); | 5737 | response, RBD_IMAGE_ID_LEN_MAX); |
| 5877 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 5738 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
| 5878 | if (ret == -ENOENT) { | 5739 | if (ret == -ENOENT) { |
| 5879 | image_id = kstrdup("", GFP_KERNEL); | 5740 | image_id = kstrdup("", GFP_KERNEL); |
| @@ -5896,8 +5757,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
| 5896 | } | 5757 | } |
| 5897 | out: | 5758 | out: |
| 5898 | kfree(response); | 5759 | kfree(response); |
| 5899 | kfree(object_name); | 5760 | ceph_oid_destroy(&oid); |
| 5900 | |||
| 5901 | return ret; | 5761 | return ret; |
| 5902 | } | 5762 | } |
| 5903 | 5763 | ||
| @@ -5944,14 +5804,20 @@ static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) | |||
| 5944 | if (ret < 0) | 5804 | if (ret < 0) |
| 5945 | goto out_err; | 5805 | goto out_err; |
| 5946 | } | 5806 | } |
| 5947 | /* No support for crypto and compression type format 2 images */ | ||
| 5948 | 5807 | ||
| 5808 | if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { | ||
| 5809 | ret = rbd_dev_v2_data_pool(rbd_dev); | ||
| 5810 | if (ret) | ||
| 5811 | goto out_err; | ||
| 5812 | } | ||
| 5813 | |||
| 5814 | rbd_init_layout(rbd_dev); | ||
| 5949 | return 0; | 5815 | return 0; |
| 5816 | |||
| 5950 | out_err: | 5817 | out_err: |
| 5951 | rbd_dev->header.features = 0; | 5818 | rbd_dev->header.features = 0; |
| 5952 | kfree(rbd_dev->header.object_prefix); | 5819 | kfree(rbd_dev->header.object_prefix); |
| 5953 | rbd_dev->header.object_prefix = NULL; | 5820 | rbd_dev->header.object_prefix = NULL; |
| 5954 | |||
| 5955 | return ret; | 5821 | return ret; |
| 5956 | } | 5822 | } |
| 5957 | 5823 | ||
| @@ -6077,8 +5943,6 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev) | |||
| 6077 | /* Record the header object name for this rbd image. */ | 5943 | /* Record the header object name for this rbd image. */ |
| 6078 | 5944 | ||
| 6079 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); | 5945 | rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); |
| 6080 | |||
| 6081 | rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id; | ||
| 6082 | if (rbd_dev->image_format == 1) | 5946 | if (rbd_dev->image_format == 1) |
| 6083 | ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", | 5947 | ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", |
| 6084 | spec->image_name, RBD_SUFFIX); | 5948 | spec->image_name, RBD_SUFFIX); |
| @@ -6471,27 +6335,16 @@ static int rbd_slab_init(void) | |||
| 6471 | if (!rbd_obj_request_cache) | 6335 | if (!rbd_obj_request_cache) |
| 6472 | goto out_err; | 6336 | goto out_err; |
| 6473 | 6337 | ||
| 6474 | rbd_assert(!rbd_segment_name_cache); | 6338 | return 0; |
| 6475 | rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", | ||
| 6476 | CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); | ||
| 6477 | if (rbd_segment_name_cache) | ||
| 6478 | return 0; | ||
| 6479 | out_err: | ||
| 6480 | kmem_cache_destroy(rbd_obj_request_cache); | ||
| 6481 | rbd_obj_request_cache = NULL; | ||
| 6482 | 6339 | ||
| 6340 | out_err: | ||
| 6483 | kmem_cache_destroy(rbd_img_request_cache); | 6341 | kmem_cache_destroy(rbd_img_request_cache); |
| 6484 | rbd_img_request_cache = NULL; | 6342 | rbd_img_request_cache = NULL; |
| 6485 | |||
| 6486 | return -ENOMEM; | 6343 | return -ENOMEM; |
| 6487 | } | 6344 | } |
| 6488 | 6345 | ||
| 6489 | static void rbd_slab_exit(void) | 6346 | static void rbd_slab_exit(void) |
| 6490 | { | 6347 | { |
| 6491 | rbd_assert(rbd_segment_name_cache); | ||
| 6492 | kmem_cache_destroy(rbd_segment_name_cache); | ||
| 6493 | rbd_segment_name_cache = NULL; | ||
| 6494 | |||
| 6495 | rbd_assert(rbd_obj_request_cache); | 6348 | rbd_assert(rbd_obj_request_cache); |
| 6496 | kmem_cache_destroy(rbd_obj_request_cache); | 6349 | kmem_cache_destroy(rbd_obj_request_cache); |
| 6497 | rbd_obj_request_cache = NULL; | 6350 | rbd_obj_request_cache = NULL; |
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 94f367db27b0..62ff50d3e7a6 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h | |||
| @@ -25,8 +25,8 @@ | |||
| 25 | */ | 25 | */ |
| 26 | 26 | ||
| 27 | #define RBD_HEADER_PREFIX "rbd_header." | 27 | #define RBD_HEADER_PREFIX "rbd_header." |
| 28 | #define RBD_DATA_PREFIX "rbd_data." | ||
| 29 | #define RBD_ID_PREFIX "rbd_id." | 28 | #define RBD_ID_PREFIX "rbd_id." |
| 29 | #define RBD_V2_DATA_FORMAT "%s.%016llx" | ||
| 30 | 30 | ||
| 31 | #define RBD_LOCK_NAME "rbd_lock" | 31 | #define RBD_LOCK_NAME "rbd_lock" |
| 32 | #define RBD_LOCK_TAG "internal" | 32 | #define RBD_LOCK_TAG "internal" |
| @@ -42,13 +42,14 @@ enum rbd_notify_op { | |||
| 42 | /* | 42 | /* |
| 43 | * For format version 1, rbd image 'foo' consists of objects | 43 | * For format version 1, rbd image 'foo' consists of objects |
| 44 | * foo.rbd - image metadata | 44 | * foo.rbd - image metadata |
| 45 | * rb.<idhi>.<idlo>.00000000 | 45 | * rb.<idhi>.<idlo>.<extra>.000000000000 |
| 46 | * rb.<idhi>.<idlo>.00000001 | 46 | * rb.<idhi>.<idlo>.<extra>.000000000001 |
| 47 | * ... - data | 47 | * ... - data |
| 48 | * There is no notion of a persistent image id in rbd format 1. | 48 | * There is no notion of a persistent image id in rbd format 1. |
| 49 | */ | 49 | */ |
| 50 | 50 | ||
| 51 | #define RBD_SUFFIX ".rbd" | 51 | #define RBD_SUFFIX ".rbd" |
| 52 | #define RBD_V1_DATA_FORMAT "%s.%012llx" | ||
| 52 | 53 | ||
| 53 | #define RBD_DIRECTORY "rbd_directory" | 54 | #define RBD_DIRECTORY "rbd_directory" |
| 54 | #define RBD_INFO "rbd_info" | 55 | #define RBD_INFO "rbd_info" |
| @@ -57,9 +58,6 @@ enum rbd_notify_op { | |||
| 57 | #define RBD_MIN_OBJ_ORDER 16 | 58 | #define RBD_MIN_OBJ_ORDER 16 |
| 58 | #define RBD_MAX_OBJ_ORDER 30 | 59 | #define RBD_MAX_OBJ_ORDER 30 |
| 59 | 60 | ||
| 60 | #define RBD_COMP_NONE 0 | ||
| 61 | #define RBD_CRYPT_NONE 0 | ||
| 62 | |||
| 63 | #define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" | 61 | #define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" |
| 64 | #define RBD_HEADER_SIGNATURE "RBD" | 62 | #define RBD_HEADER_SIGNATURE "RBD" |
| 65 | #define RBD_HEADER_VERSION "001.005" | 63 | #define RBD_HEADER_VERSION "001.005" |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7ce35aec8c76..f297a9e18642 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
| @@ -391,6 +391,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) | |||
| 391 | nr_pages = i; | 391 | nr_pages = i; |
| 392 | if (nr_pages > 0) { | 392 | if (nr_pages > 0) { |
| 393 | len = nr_pages << PAGE_SHIFT; | 393 | len = nr_pages << PAGE_SHIFT; |
| 394 | osd_req_op_extent_update(req, 0, len); | ||
| 394 | break; | 395 | break; |
| 395 | } | 396 | } |
| 396 | goto out_pages; | 397 | goto out_pages; |
| @@ -771,7 +772,7 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
| 771 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : | 772 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : |
| 772 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); | 773 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); |
| 773 | 774 | ||
| 774 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 775 | if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
| 775 | if (ci->i_wrbuffer_ref > 0) { | 776 | if (ci->i_wrbuffer_ref > 0) { |
| 776 | pr_warn_ratelimited( | 777 | pr_warn_ratelimited( |
| 777 | "writepage_start %p %lld forced umount\n", | 778 | "writepage_start %p %lld forced umount\n", |
| @@ -1017,8 +1018,7 @@ new_request: | |||
| 1017 | &ci->i_layout, vino, | 1018 | &ci->i_layout, vino, |
| 1018 | offset, &len, 0, num_ops, | 1019 | offset, &len, 0, num_ops, |
| 1019 | CEPH_OSD_OP_WRITE, | 1020 | CEPH_OSD_OP_WRITE, |
| 1020 | CEPH_OSD_FLAG_WRITE | | 1021 | CEPH_OSD_FLAG_WRITE, |
| 1021 | CEPH_OSD_FLAG_ONDISK, | ||
| 1022 | snapc, truncate_seq, | 1022 | snapc, truncate_seq, |
| 1023 | truncate_size, false); | 1023 | truncate_size, false); |
| 1024 | if (IS_ERR(req)) { | 1024 | if (IS_ERR(req)) { |
| @@ -1028,8 +1028,7 @@ new_request: | |||
| 1028 | min(num_ops, | 1028 | min(num_ops, |
| 1029 | CEPH_OSD_SLAB_OPS), | 1029 | CEPH_OSD_SLAB_OPS), |
| 1030 | CEPH_OSD_OP_WRITE, | 1030 | CEPH_OSD_OP_WRITE, |
| 1031 | CEPH_OSD_FLAG_WRITE | | 1031 | CEPH_OSD_FLAG_WRITE, |
| 1032 | CEPH_OSD_FLAG_ONDISK, | ||
| 1033 | snapc, truncate_seq, | 1032 | snapc, truncate_seq, |
| 1034 | truncate_size, true); | 1033 | truncate_size, true); |
| 1035 | BUG_ON(IS_ERR(req)); | 1034 | BUG_ON(IS_ERR(req)); |
| @@ -1194,7 +1193,7 @@ static int ceph_update_writeable_page(struct file *file, | |||
| 1194 | int r; | 1193 | int r; |
| 1195 | struct ceph_snap_context *snapc, *oldest; | 1194 | struct ceph_snap_context *snapc, *oldest; |
| 1196 | 1195 | ||
| 1197 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 1196 | if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
| 1198 | dout(" page %p forced umount\n", page); | 1197 | dout(" page %p forced umount\n", page); |
| 1199 | unlock_page(page); | 1198 | unlock_page(page); |
| 1200 | return -EIO; | 1199 | return -EIO; |
| @@ -1681,8 +1680,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) | |||
| 1681 | 1680 | ||
| 1682 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | 1681 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
| 1683 | ceph_vino(inode), 0, &len, 0, 1, | 1682 | ceph_vino(inode), 0, &len, 0, 1, |
| 1684 | CEPH_OSD_OP_CREATE, | 1683 | CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE, |
| 1685 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, | ||
| 1686 | NULL, 0, 0, false); | 1684 | NULL, 0, 0, false); |
| 1687 | if (IS_ERR(req)) { | 1685 | if (IS_ERR(req)) { |
| 1688 | err = PTR_ERR(req); | 1686 | err = PTR_ERR(req); |
| @@ -1699,8 +1697,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) | |||
| 1699 | 1697 | ||
| 1700 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | 1698 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
| 1701 | ceph_vino(inode), 0, &len, 1, 3, | 1699 | ceph_vino(inode), 0, &len, 1, 3, |
| 1702 | CEPH_OSD_OP_WRITE, | 1700 | CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, |
| 1703 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, | ||
| 1704 | NULL, ci->i_truncate_seq, | 1701 | NULL, ci->i_truncate_seq, |
| 1705 | ci->i_truncate_size, false); | 1702 | ci->i_truncate_size, false); |
| 1706 | if (IS_ERR(req)) { | 1703 | if (IS_ERR(req)) { |
| @@ -1873,7 +1870,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, | |||
| 1873 | goto out_unlock; | 1870 | goto out_unlock; |
| 1874 | } | 1871 | } |
| 1875 | 1872 | ||
| 1876 | wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; | 1873 | wr_req->r_flags = CEPH_OSD_FLAG_WRITE; |
| 1877 | osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); | 1874 | osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); |
| 1878 | ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); | 1875 | ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); |
| 1879 | ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); | 1876 | ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); |
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5bc5d37b1217..4e7421caf380 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c | |||
| @@ -234,7 +234,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) | |||
| 234 | fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, | 234 | fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, |
| 235 | inode); | 235 | inode); |
| 236 | if (fscache_cookie_enabled(ci->fscache)) { | 236 | if (fscache_cookie_enabled(ci->fscache)) { |
| 237 | dout("fscache_file_set_cookie %p %p enabing cache\n", | 237 | dout("fscache_file_set_cookie %p %p enabling cache\n", |
| 238 | inode, filp); | 238 | inode, filp); |
| 239 | } | 239 | } |
| 240 | } | 240 | } |
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 94fd76d04683..cd966f276a8d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
| @@ -867,7 +867,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) | |||
| 867 | /* | 867 | /* |
| 868 | * Return caps we have registered with the MDS(s) as 'wanted'. | 868 | * Return caps we have registered with the MDS(s) as 'wanted'. |
| 869 | */ | 869 | */ |
| 870 | int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) | 870 | int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) |
| 871 | { | 871 | { |
| 872 | struct ceph_cap *cap; | 872 | struct ceph_cap *cap; |
| 873 | struct rb_node *p; | 873 | struct rb_node *p; |
| @@ -875,7 +875,7 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci) | |||
| 875 | 875 | ||
| 876 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | 876 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { |
| 877 | cap = rb_entry(p, struct ceph_cap, ci_node); | 877 | cap = rb_entry(p, struct ceph_cap, ci_node); |
| 878 | if (!__cap_is_valid(cap)) | 878 | if (check && !__cap_is_valid(cap)) |
| 879 | continue; | 879 | continue; |
| 880 | if (cap == ci->i_auth_cap) | 880 | if (cap == ci->i_auth_cap) |
| 881 | mds_wanted |= cap->mds_wanted; | 881 | mds_wanted |= cap->mds_wanted; |
| @@ -1184,6 +1184,13 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
| 1184 | delayed = 1; | 1184 | delayed = 1; |
| 1185 | } | 1185 | } |
| 1186 | ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); | 1186 | ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); |
| 1187 | if (want & ~cap->mds_wanted) { | ||
| 1188 | /* user space may open/close single file frequently. | ||
| 1189 | * This avoids droping mds_wanted immediately after | ||
| 1190 | * requesting new mds_wanted. | ||
| 1191 | */ | ||
| 1192 | __cap_set_timeouts(mdsc, ci); | ||
| 1193 | } | ||
| 1187 | 1194 | ||
| 1188 | cap->issued &= retain; /* drop bits we don't want */ | 1195 | cap->issued &= retain; /* drop bits we don't want */ |
| 1189 | if (cap->implemented & ~cap->issued) { | 1196 | if (cap->implemented & ~cap->issued) { |
| @@ -2084,8 +2091,6 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 2084 | 2091 | ||
| 2085 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); | 2092 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); |
| 2086 | 2093 | ||
| 2087 | ceph_sync_write_wait(inode); | ||
| 2088 | |||
| 2089 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 2094 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); |
| 2090 | if (ret < 0) | 2095 | if (ret < 0) |
| 2091 | goto out; | 2096 | goto out; |
| @@ -2477,23 +2482,22 @@ again: | |||
| 2477 | 2482 | ||
| 2478 | if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { | 2483 | if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { |
| 2479 | int mds_wanted; | 2484 | int mds_wanted; |
| 2480 | if (ACCESS_ONCE(mdsc->fsc->mount_state) == | 2485 | if (READ_ONCE(mdsc->fsc->mount_state) == |
| 2481 | CEPH_MOUNT_SHUTDOWN) { | 2486 | CEPH_MOUNT_SHUTDOWN) { |
| 2482 | dout("get_cap_refs %p forced umount\n", inode); | 2487 | dout("get_cap_refs %p forced umount\n", inode); |
| 2483 | *err = -EIO; | 2488 | *err = -EIO; |
| 2484 | ret = 1; | 2489 | ret = 1; |
| 2485 | goto out_unlock; | 2490 | goto out_unlock; |
| 2486 | } | 2491 | } |
| 2487 | mds_wanted = __ceph_caps_mds_wanted(ci); | 2492 | mds_wanted = __ceph_caps_mds_wanted(ci, false); |
| 2488 | if ((mds_wanted & need) != need) { | 2493 | if (need & ~(mds_wanted & need)) { |
| 2489 | dout("get_cap_refs %p caps were dropped" | 2494 | dout("get_cap_refs %p caps were dropped" |
| 2490 | " (session killed?)\n", inode); | 2495 | " (session killed?)\n", inode); |
| 2491 | *err = -ESTALE; | 2496 | *err = -ESTALE; |
| 2492 | ret = 1; | 2497 | ret = 1; |
| 2493 | goto out_unlock; | 2498 | goto out_unlock; |
| 2494 | } | 2499 | } |
| 2495 | if ((mds_wanted & file_wanted) == | 2500 | if (!(file_wanted & ~mds_wanted)) |
| 2496 | (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) | ||
| 2497 | ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; | 2501 | ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; |
| 2498 | } | 2502 | } |
| 2499 | 2503 | ||
| @@ -3404,6 +3408,7 @@ retry: | |||
| 3404 | tcap->implemented |= issued; | 3408 | tcap->implemented |= issued; |
| 3405 | if (cap == ci->i_auth_cap) | 3409 | if (cap == ci->i_auth_cap) |
| 3406 | ci->i_auth_cap = tcap; | 3410 | ci->i_auth_cap = tcap; |
| 3411 | |||
| 3407 | if (!list_empty(&ci->i_cap_flush_list) && | 3412 | if (!list_empty(&ci->i_cap_flush_list) && |
| 3408 | ci->i_auth_cap == tcap) { | 3413 | ci->i_auth_cap == tcap) { |
| 3409 | spin_lock(&mdsc->cap_dirty_lock); | 3414 | spin_lock(&mdsc->cap_dirty_lock); |
| @@ -3417,9 +3422,18 @@ retry: | |||
| 3417 | } else if (tsession) { | 3422 | } else if (tsession) { |
| 3418 | /* add placeholder for the export tagert */ | 3423 | /* add placeholder for the export tagert */ |
| 3419 | int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; | 3424 | int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; |
| 3425 | tcap = new_cap; | ||
| 3420 | ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, | 3426 | ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, |
| 3421 | t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); | 3427 | t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); |
| 3422 | 3428 | ||
| 3429 | if (!list_empty(&ci->i_cap_flush_list) && | ||
| 3430 | ci->i_auth_cap == tcap) { | ||
| 3431 | spin_lock(&mdsc->cap_dirty_lock); | ||
| 3432 | list_move_tail(&ci->i_flushing_item, | ||
| 3433 | &tcap->session->s_cap_flushing); | ||
| 3434 | spin_unlock(&mdsc->cap_dirty_lock); | ||
| 3435 | } | ||
| 3436 | |||
| 3423 | __ceph_remove_cap(cap, false); | 3437 | __ceph_remove_cap(cap, false); |
| 3424 | goto out_unlock; | 3438 | goto out_unlock; |
| 3425 | } | 3439 | } |
| @@ -3924,9 +3938,10 @@ int ceph_encode_inode_release(void **p, struct inode *inode, | |||
| 3924 | } | 3938 | } |
| 3925 | 3939 | ||
| 3926 | int ceph_encode_dentry_release(void **p, struct dentry *dentry, | 3940 | int ceph_encode_dentry_release(void **p, struct dentry *dentry, |
| 3941 | struct inode *dir, | ||
| 3927 | int mds, int drop, int unless) | 3942 | int mds, int drop, int unless) |
| 3928 | { | 3943 | { |
| 3929 | struct inode *dir = d_inode(dentry->d_parent); | 3944 | struct dentry *parent = NULL; |
| 3930 | struct ceph_mds_request_release *rel = *p; | 3945 | struct ceph_mds_request_release *rel = *p; |
| 3931 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 3946 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
| 3932 | int force = 0; | 3947 | int force = 0; |
| @@ -3941,9 +3956,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, | |||
| 3941 | spin_lock(&dentry->d_lock); | 3956 | spin_lock(&dentry->d_lock); |
| 3942 | if (di->lease_session && di->lease_session->s_mds == mds) | 3957 | if (di->lease_session && di->lease_session->s_mds == mds) |
| 3943 | force = 1; | 3958 | force = 1; |
| 3959 | if (!dir) { | ||
| 3960 | parent = dget(dentry->d_parent); | ||
| 3961 | dir = d_inode(parent); | ||
| 3962 | } | ||
| 3944 | spin_unlock(&dentry->d_lock); | 3963 | spin_unlock(&dentry->d_lock); |
| 3945 | 3964 | ||
| 3946 | ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); | 3965 | ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); |
| 3966 | dput(parent); | ||
| 3947 | 3967 | ||
| 3948 | spin_lock(&dentry->d_lock); | 3968 | spin_lock(&dentry->d_lock); |
| 3949 | if (ret && di->lease_session && di->lease_session->s_mds == mds) { | 3969 | if (ret && di->lease_session && di->lease_session->s_mds == mds) { |
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 39ff678e567f..f2ae393e2c31 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
| @@ -70,7 +70,7 @@ static int mdsc_show(struct seq_file *s, void *p) | |||
| 70 | 70 | ||
| 71 | seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); | 71 | seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); |
| 72 | 72 | ||
| 73 | if (req->r_got_unsafe) | 73 | if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) |
| 74 | seq_puts(s, "\t(unsafe)"); | 74 | seq_puts(s, "\t(unsafe)"); |
| 75 | else | 75 | else |
| 76 | seq_puts(s, "\t"); | 76 | seq_puts(s, "\t"); |
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8ab1fdf0bd49..3e9ad501addf 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
| @@ -371,7 +371,7 @@ more: | |||
| 371 | /* hints to request -> mds selection code */ | 371 | /* hints to request -> mds selection code */ |
| 372 | req->r_direct_mode = USE_AUTH_MDS; | 372 | req->r_direct_mode = USE_AUTH_MDS; |
| 373 | req->r_direct_hash = ceph_frag_value(frag); | 373 | req->r_direct_hash = ceph_frag_value(frag); |
| 374 | req->r_direct_is_hash = true; | 374 | __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); |
| 375 | if (fi->last_name) { | 375 | if (fi->last_name) { |
| 376 | req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); | 376 | req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); |
| 377 | if (!req->r_path2) { | 377 | if (!req->r_path2) { |
| @@ -417,7 +417,7 @@ more: | |||
| 417 | fi->frag = frag; | 417 | fi->frag = frag; |
| 418 | fi->last_readdir = req; | 418 | fi->last_readdir = req; |
| 419 | 419 | ||
| 420 | if (req->r_did_prepopulate) { | 420 | if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { |
| 421 | fi->readdir_cache_idx = req->r_readdir_cache_idx; | 421 | fi->readdir_cache_idx = req->r_readdir_cache_idx; |
| 422 | if (fi->readdir_cache_idx < 0) { | 422 | if (fi->readdir_cache_idx < 0) { |
| 423 | /* preclude from marking dir ordered */ | 423 | /* preclude from marking dir ordered */ |
| @@ -752,7 +752,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, | |||
| 752 | mask |= CEPH_CAP_XATTR_SHARED; | 752 | mask |= CEPH_CAP_XATTR_SHARED; |
| 753 | req->r_args.getattr.mask = cpu_to_le32(mask); | 753 | req->r_args.getattr.mask = cpu_to_le32(mask); |
| 754 | 754 | ||
| 755 | req->r_locked_dir = dir; | 755 | req->r_parent = dir; |
| 756 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 756 | err = ceph_mdsc_do_request(mdsc, NULL, req); | 757 | err = ceph_mdsc_do_request(mdsc, NULL, req); |
| 757 | err = ceph_handle_snapdir(req, dentry, err); | 758 | err = ceph_handle_snapdir(req, dentry, err); |
| 758 | dentry = ceph_finish_lookup(req, dentry, err); | 759 | dentry = ceph_finish_lookup(req, dentry, err); |
| @@ -813,7 +814,8 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, | |||
| 813 | } | 814 | } |
| 814 | req->r_dentry = dget(dentry); | 815 | req->r_dentry = dget(dentry); |
| 815 | req->r_num_caps = 2; | 816 | req->r_num_caps = 2; |
| 816 | req->r_locked_dir = dir; | 817 | req->r_parent = dir; |
| 818 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 817 | req->r_args.mknod.mode = cpu_to_le32(mode); | 819 | req->r_args.mknod.mode = cpu_to_le32(mode); |
| 818 | req->r_args.mknod.rdev = cpu_to_le32(rdev); | 820 | req->r_args.mknod.rdev = cpu_to_le32(rdev); |
| 819 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 821 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
| @@ -864,7 +866,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, | |||
| 864 | ceph_mdsc_put_request(req); | 866 | ceph_mdsc_put_request(req); |
| 865 | goto out; | 867 | goto out; |
| 866 | } | 868 | } |
| 867 | req->r_locked_dir = dir; | 869 | req->r_parent = dir; |
| 870 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 868 | req->r_dentry = dget(dentry); | 871 | req->r_dentry = dget(dentry); |
| 869 | req->r_num_caps = 2; | 872 | req->r_num_caps = 2; |
| 870 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 873 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
| @@ -913,7 +916,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 913 | 916 | ||
| 914 | req->r_dentry = dget(dentry); | 917 | req->r_dentry = dget(dentry); |
| 915 | req->r_num_caps = 2; | 918 | req->r_num_caps = 2; |
| 916 | req->r_locked_dir = dir; | 919 | req->r_parent = dir; |
| 920 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 917 | req->r_args.mkdir.mode = cpu_to_le32(mode); | 921 | req->r_args.mkdir.mode = cpu_to_le32(mode); |
| 918 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 922 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
| 919 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; | 923 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; |
| @@ -957,7 +961,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, | |||
| 957 | req->r_dentry = dget(dentry); | 961 | req->r_dentry = dget(dentry); |
| 958 | req->r_num_caps = 2; | 962 | req->r_num_caps = 2; |
| 959 | req->r_old_dentry = dget(old_dentry); | 963 | req->r_old_dentry = dget(old_dentry); |
| 960 | req->r_locked_dir = dir; | 964 | req->r_parent = dir; |
| 965 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 961 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 966 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
| 962 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; | 967 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; |
| 963 | /* release LINK_SHARED on source inode (mds will lock it) */ | 968 | /* release LINK_SHARED on source inode (mds will lock it) */ |
| @@ -1023,7 +1028,8 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) | |||
| 1023 | } | 1028 | } |
| 1024 | req->r_dentry = dget(dentry); | 1029 | req->r_dentry = dget(dentry); |
| 1025 | req->r_num_caps = 2; | 1030 | req->r_num_caps = 2; |
| 1026 | req->r_locked_dir = dir; | 1031 | req->r_parent = dir; |
| 1032 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 1027 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 1033 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
| 1028 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; | 1034 | req->r_dentry_unless = CEPH_CAP_FILE_EXCL; |
| 1029 | req->r_inode_drop = drop_caps_for_unlink(inode); | 1035 | req->r_inode_drop = drop_caps_for_unlink(inode); |
| @@ -1066,7 +1072,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 1066 | req->r_num_caps = 2; | 1072 | req->r_num_caps = 2; |
| 1067 | req->r_old_dentry = dget(old_dentry); | 1073 | req->r_old_dentry = dget(old_dentry); |
| 1068 | req->r_old_dentry_dir = old_dir; | 1074 | req->r_old_dentry_dir = old_dir; |
| 1069 | req->r_locked_dir = new_dir; | 1075 | req->r_parent = new_dir; |
| 1076 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 1070 | req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; | 1077 | req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED; |
| 1071 | req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; | 1078 | req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL; |
| 1072 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; | 1079 | req->r_dentry_drop = CEPH_CAP_FILE_SHARED; |
| @@ -1194,7 +1201,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) | |||
| 1194 | struct inode *dir; | 1201 | struct inode *dir; |
| 1195 | 1202 | ||
| 1196 | if (flags & LOOKUP_RCU) { | 1203 | if (flags & LOOKUP_RCU) { |
| 1197 | parent = ACCESS_ONCE(dentry->d_parent); | 1204 | parent = READ_ONCE(dentry->d_parent); |
| 1198 | dir = d_inode_rcu(parent); | 1205 | dir = d_inode_rcu(parent); |
| 1199 | if (!dir) | 1206 | if (!dir) |
| 1200 | return -ECHILD; | 1207 | return -ECHILD; |
| @@ -1237,11 +1244,12 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) | |||
| 1237 | return -ECHILD; | 1244 | return -ECHILD; |
| 1238 | 1245 | ||
| 1239 | op = ceph_snap(dir) == CEPH_SNAPDIR ? | 1246 | op = ceph_snap(dir) == CEPH_SNAPDIR ? |
| 1240 | CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; | 1247 | CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; |
| 1241 | req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); | 1248 | req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); |
| 1242 | if (!IS_ERR(req)) { | 1249 | if (!IS_ERR(req)) { |
| 1243 | req->r_dentry = dget(dentry); | 1250 | req->r_dentry = dget(dentry); |
| 1244 | req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2; | 1251 | req->r_num_caps = 2; |
| 1252 | req->r_parent = dir; | ||
| 1245 | 1253 | ||
| 1246 | mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; | 1254 | mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; |
| 1247 | if (ceph_security_xattr_wanted(dir)) | 1255 | if (ceph_security_xattr_wanted(dir)) |
diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 180bbef760f2..e8f11fa565c5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c | |||
| @@ -207,7 +207,8 @@ static int ceph_get_name(struct dentry *parent, char *name, | |||
| 207 | req->r_inode = d_inode(child); | 207 | req->r_inode = d_inode(child); |
| 208 | ihold(d_inode(child)); | 208 | ihold(d_inode(child)); |
| 209 | req->r_ino2 = ceph_vino(d_inode(parent)); | 209 | req->r_ino2 = ceph_vino(d_inode(parent)); |
| 210 | req->r_locked_dir = d_inode(parent); | 210 | req->r_parent = d_inode(parent); |
| 211 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 211 | req->r_num_caps = 2; | 212 | req->r_num_caps = 2; |
| 212 | err = ceph_mdsc_do_request(mdsc, NULL, req); | 213 | err = ceph_mdsc_do_request(mdsc, NULL, req); |
| 213 | 214 | ||
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 045d30d26624..26cc95421cca 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
| @@ -283,7 +283,7 @@ int ceph_open(struct inode *inode, struct file *file) | |||
| 283 | spin_lock(&ci->i_ceph_lock); | 283 | spin_lock(&ci->i_ceph_lock); |
| 284 | if (__ceph_is_any_real_caps(ci) && | 284 | if (__ceph_is_any_real_caps(ci) && |
| 285 | (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { | 285 | (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { |
| 286 | int mds_wanted = __ceph_caps_mds_wanted(ci); | 286 | int mds_wanted = __ceph_caps_mds_wanted(ci, true); |
| 287 | int issued = __ceph_caps_issued(ci, NULL); | 287 | int issued = __ceph_caps_issued(ci, NULL); |
| 288 | 288 | ||
| 289 | dout("open %p fmode %d want %s issued %s using existing\n", | 289 | dout("open %p fmode %d want %s issued %s using existing\n", |
| @@ -379,7 +379,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | |||
| 379 | mask |= CEPH_CAP_XATTR_SHARED; | 379 | mask |= CEPH_CAP_XATTR_SHARED; |
| 380 | req->r_args.open.mask = cpu_to_le32(mask); | 380 | req->r_args.open.mask = cpu_to_le32(mask); |
| 381 | 381 | ||
| 382 | req->r_locked_dir = dir; /* caller holds dir->i_mutex */ | 382 | req->r_parent = dir; |
| 383 | set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); | ||
| 383 | err = ceph_mdsc_do_request(mdsc, | 384 | err = ceph_mdsc_do_request(mdsc, |
| 384 | (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, | 385 | (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, |
| 385 | req); | 386 | req); |
| @@ -758,9 +759,7 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
| 758 | goto out; | 759 | goto out; |
| 759 | } | 760 | } |
| 760 | 761 | ||
| 761 | req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | | 762 | req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; |
| 762 | CEPH_OSD_FLAG_ONDISK | | ||
| 763 | CEPH_OSD_FLAG_WRITE; | ||
| 764 | ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); | 763 | ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); |
| 765 | ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); | 764 | ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); |
| 766 | 765 | ||
| @@ -794,89 +793,6 @@ out: | |||
| 794 | kfree(aio_work); | 793 | kfree(aio_work); |
| 795 | } | 794 | } |
| 796 | 795 | ||
| 797 | /* | ||
| 798 | * Write commit request unsafe callback, called to tell us when a | ||
| 799 | * request is unsafe (that is, in flight--has been handed to the | ||
| 800 | * messenger to send to its target osd). It is called again when | ||
| 801 | * we've received a response message indicating the request is | ||
| 802 | * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request | ||
| 803 | * is completed early (and unsuccessfully) due to a timeout or | ||
| 804 | * interrupt. | ||
| 805 | * | ||
| 806 | * This is used if we requested both an ACK and ONDISK commit reply | ||
| 807 | * from the OSD. | ||
| 808 | */ | ||
| 809 | static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) | ||
| 810 | { | ||
| 811 | struct ceph_inode_info *ci = ceph_inode(req->r_inode); | ||
| 812 | |||
| 813 | dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid, | ||
| 814 | unsafe ? "un" : ""); | ||
| 815 | if (unsafe) { | ||
| 816 | ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); | ||
| 817 | spin_lock(&ci->i_unsafe_lock); | ||
| 818 | list_add_tail(&req->r_unsafe_item, | ||
| 819 | &ci->i_unsafe_writes); | ||
| 820 | spin_unlock(&ci->i_unsafe_lock); | ||
| 821 | |||
| 822 | complete_all(&req->r_completion); | ||
| 823 | } else { | ||
| 824 | spin_lock(&ci->i_unsafe_lock); | ||
| 825 | list_del_init(&req->r_unsafe_item); | ||
| 826 | spin_unlock(&ci->i_unsafe_lock); | ||
| 827 | ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); | ||
| 828 | } | ||
| 829 | } | ||
| 830 | |||
| 831 | /* | ||
| 832 | * Wait on any unsafe replies for the given inode. First wait on the | ||
| 833 | * newest request, and make that the upper bound. Then, if there are | ||
| 834 | * more requests, keep waiting on the oldest as long as it is still older | ||
| 835 | * than the original request. | ||
| 836 | */ | ||
| 837 | void ceph_sync_write_wait(struct inode *inode) | ||
| 838 | { | ||
| 839 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
| 840 | struct list_head *head = &ci->i_unsafe_writes; | ||
| 841 | struct ceph_osd_request *req; | ||
| 842 | u64 last_tid; | ||
| 843 | |||
| 844 | if (!S_ISREG(inode->i_mode)) | ||
| 845 | return; | ||
| 846 | |||
| 847 | spin_lock(&ci->i_unsafe_lock); | ||
| 848 | if (list_empty(head)) | ||
| 849 | goto out; | ||
| 850 | |||
| 851 | /* set upper bound as _last_ entry in chain */ | ||
| 852 | |||
| 853 | req = list_last_entry(head, struct ceph_osd_request, | ||
| 854 | r_unsafe_item); | ||
| 855 | last_tid = req->r_tid; | ||
| 856 | |||
| 857 | do { | ||
| 858 | ceph_osdc_get_request(req); | ||
| 859 | spin_unlock(&ci->i_unsafe_lock); | ||
| 860 | |||
| 861 | dout("sync_write_wait on tid %llu (until %llu)\n", | ||
| 862 | req->r_tid, last_tid); | ||
| 863 | wait_for_completion(&req->r_done_completion); | ||
| 864 | ceph_osdc_put_request(req); | ||
| 865 | |||
| 866 | spin_lock(&ci->i_unsafe_lock); | ||
| 867 | /* | ||
| 868 | * from here on look at first entry in chain, since we | ||
| 869 | * only want to wait for anything older than last_tid | ||
| 870 | */ | ||
| 871 | if (list_empty(head)) | ||
| 872 | break; | ||
| 873 | req = list_first_entry(head, struct ceph_osd_request, | ||
| 874 | r_unsafe_item); | ||
| 875 | } while (req->r_tid < last_tid); | ||
| 876 | out: | ||
| 877 | spin_unlock(&ci->i_unsafe_lock); | ||
| 878 | } | ||
| 879 | |||
| 880 | static ssize_t | 796 | static ssize_t |
| 881 | ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, | 797 | ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, |
| 882 | struct ceph_snap_context *snapc, | 798 | struct ceph_snap_context *snapc, |
| @@ -915,9 +831,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, | |||
| 915 | if (ret2 < 0) | 831 | if (ret2 < 0) |
| 916 | dout("invalidate_inode_pages2_range returned %d\n", ret2); | 832 | dout("invalidate_inode_pages2_range returned %d\n", ret2); |
| 917 | 833 | ||
| 918 | flags = CEPH_OSD_FLAG_ORDERSNAP | | 834 | flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; |
| 919 | CEPH_OSD_FLAG_ONDISK | | ||
| 920 | CEPH_OSD_FLAG_WRITE; | ||
| 921 | } else { | 835 | } else { |
| 922 | flags = CEPH_OSD_FLAG_READ; | 836 | flags = CEPH_OSD_FLAG_READ; |
| 923 | } | 837 | } |
| @@ -1116,10 +1030,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, | |||
| 1116 | if (ret < 0) | 1030 | if (ret < 0) |
| 1117 | dout("invalidate_inode_pages2_range returned %d\n", ret); | 1031 | dout("invalidate_inode_pages2_range returned %d\n", ret); |
| 1118 | 1032 | ||
| 1119 | flags = CEPH_OSD_FLAG_ORDERSNAP | | 1033 | flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; |
| 1120 | CEPH_OSD_FLAG_ONDISK | | ||
| 1121 | CEPH_OSD_FLAG_WRITE | | ||
| 1122 | CEPH_OSD_FLAG_ACK; | ||
| 1123 | 1034 | ||
| 1124 | while ((len = iov_iter_count(from)) > 0) { | 1035 | while ((len = iov_iter_count(from)) > 0) { |
| 1125 | size_t left; | 1036 | size_t left; |
| @@ -1165,8 +1076,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, | |||
| 1165 | goto out; | 1076 | goto out; |
| 1166 | } | 1077 | } |
| 1167 | 1078 | ||
| 1168 | /* get a second commit callback */ | ||
| 1169 | req->r_unsafe_callback = ceph_sync_write_unsafe; | ||
| 1170 | req->r_inode = inode; | 1079 | req->r_inode = inode; |
| 1171 | 1080 | ||
| 1172 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, | 1081 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, |
| @@ -1616,8 +1525,7 @@ static int ceph_zero_partial_object(struct inode *inode, | |||
| 1616 | ceph_vino(inode), | 1525 | ceph_vino(inode), |
| 1617 | offset, length, | 1526 | offset, length, |
| 1618 | 0, 1, op, | 1527 | 0, 1, op, |
| 1619 | CEPH_OSD_FLAG_WRITE | | 1528 | CEPH_OSD_FLAG_WRITE, |
| 1620 | CEPH_OSD_FLAG_ONDISK, | ||
| 1621 | NULL, 0, 0, false); | 1529 | NULL, 0, 0, false); |
| 1622 | if (IS_ERR(req)) { | 1530 | if (IS_ERR(req)) { |
| 1623 | ret = PTR_ERR(req); | 1531 | ret = PTR_ERR(req); |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5e659d054b40..fd8f771f99b7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
| @@ -499,7 +499,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
| 499 | ci->i_rdcache_gen = 0; | 499 | ci->i_rdcache_gen = 0; |
| 500 | ci->i_rdcache_revoking = 0; | 500 | ci->i_rdcache_revoking = 0; |
| 501 | 501 | ||
| 502 | INIT_LIST_HEAD(&ci->i_unsafe_writes); | ||
| 503 | INIT_LIST_HEAD(&ci->i_unsafe_dirops); | 502 | INIT_LIST_HEAD(&ci->i_unsafe_dirops); |
| 504 | INIT_LIST_HEAD(&ci->i_unsafe_iops); | 503 | INIT_LIST_HEAD(&ci->i_unsafe_iops); |
| 505 | spin_lock_init(&ci->i_unsafe_lock); | 504 | spin_lock_init(&ci->i_unsafe_lock); |
| @@ -583,14 +582,6 @@ int ceph_drop_inode(struct inode *inode) | |||
| 583 | return 1; | 582 | return 1; |
| 584 | } | 583 | } |
| 585 | 584 | ||
| 586 | void ceph_evict_inode(struct inode *inode) | ||
| 587 | { | ||
| 588 | /* wait unsafe sync writes */ | ||
| 589 | ceph_sync_write_wait(inode); | ||
| 590 | truncate_inode_pages_final(&inode->i_data); | ||
| 591 | clear_inode(inode); | ||
| 592 | } | ||
| 593 | |||
| 594 | static inline blkcnt_t calc_inode_blocks(u64 size) | 585 | static inline blkcnt_t calc_inode_blocks(u64 size) |
| 595 | { | 586 | { |
| 596 | return (size + (1<<9) - 1) >> 9; | 587 | return (size + (1<<9) - 1) >> 9; |
| @@ -1016,7 +1007,9 @@ out: | |||
| 1016 | static void update_dentry_lease(struct dentry *dentry, | 1007 | static void update_dentry_lease(struct dentry *dentry, |
| 1017 | struct ceph_mds_reply_lease *lease, | 1008 | struct ceph_mds_reply_lease *lease, |
| 1018 | struct ceph_mds_session *session, | 1009 | struct ceph_mds_session *session, |
| 1019 | unsigned long from_time) | 1010 | unsigned long from_time, |
| 1011 | struct ceph_vino *tgt_vino, | ||
| 1012 | struct ceph_vino *dir_vino) | ||
| 1020 | { | 1013 | { |
| 1021 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 1014 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
| 1022 | long unsigned duration = le32_to_cpu(lease->duration_ms); | 1015 | long unsigned duration = le32_to_cpu(lease->duration_ms); |
| @@ -1024,13 +1017,27 @@ static void update_dentry_lease(struct dentry *dentry, | |||
| 1024 | long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; | 1017 | long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; |
| 1025 | struct inode *dir; | 1018 | struct inode *dir; |
| 1026 | 1019 | ||
| 1020 | /* | ||
| 1021 | * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that | ||
| 1022 | * we expect a negative dentry. | ||
| 1023 | */ | ||
| 1024 | if (!tgt_vino && d_really_is_positive(dentry)) | ||
| 1025 | return; | ||
| 1026 | |||
| 1027 | if (tgt_vino && (d_really_is_negative(dentry) || | ||
| 1028 | !ceph_ino_compare(d_inode(dentry), tgt_vino))) | ||
| 1029 | return; | ||
| 1030 | |||
| 1027 | spin_lock(&dentry->d_lock); | 1031 | spin_lock(&dentry->d_lock); |
| 1028 | dout("update_dentry_lease %p duration %lu ms ttl %lu\n", | 1032 | dout("update_dentry_lease %p duration %lu ms ttl %lu\n", |
| 1029 | dentry, duration, ttl); | 1033 | dentry, duration, ttl); |
| 1030 | 1034 | ||
| 1031 | /* make lease_rdcache_gen match directory */ | ||
| 1032 | dir = d_inode(dentry->d_parent); | 1035 | dir = d_inode(dentry->d_parent); |
| 1033 | 1036 | ||
| 1037 | /* make sure parent matches dir_vino */ | ||
| 1038 | if (!ceph_ino_compare(dir, dir_vino)) | ||
| 1039 | goto out_unlock; | ||
| 1040 | |||
| 1034 | /* only track leases on regular dentries */ | 1041 | /* only track leases on regular dentries */ |
| 1035 | if (ceph_snap(dir) != CEPH_NOSNAP) | 1042 | if (ceph_snap(dir) != CEPH_NOSNAP) |
| 1036 | goto out_unlock; | 1043 | goto out_unlock; |
| @@ -1108,61 +1115,27 @@ out: | |||
| 1108 | * | 1115 | * |
| 1109 | * Called with snap_rwsem (read). | 1116 | * Called with snap_rwsem (read). |
| 1110 | */ | 1117 | */ |
| 1111 | int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, | 1118 | int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) |
| 1112 | struct ceph_mds_session *session) | ||
| 1113 | { | 1119 | { |
| 1120 | struct ceph_mds_session *session = req->r_session; | ||
| 1114 | struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; | 1121 | struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; |
| 1115 | struct inode *in = NULL; | 1122 | struct inode *in = NULL; |
| 1116 | struct ceph_vino vino; | 1123 | struct ceph_vino tvino, dvino; |
| 1117 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); | 1124 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); |
| 1118 | int err = 0; | 1125 | int err = 0; |
| 1119 | 1126 | ||
| 1120 | dout("fill_trace %p is_dentry %d is_target %d\n", req, | 1127 | dout("fill_trace %p is_dentry %d is_target %d\n", req, |
| 1121 | rinfo->head->is_dentry, rinfo->head->is_target); | 1128 | rinfo->head->is_dentry, rinfo->head->is_target); |
| 1122 | 1129 | ||
| 1123 | #if 0 | ||
| 1124 | /* | ||
| 1125 | * Debugging hook: | ||
| 1126 | * | ||
| 1127 | * If we resend completed ops to a recovering mds, we get no | ||
| 1128 | * trace. Since that is very rare, pretend this is the case | ||
| 1129 | * to ensure the 'no trace' handlers in the callers behave. | ||
| 1130 | * | ||
| 1131 | * Fill in inodes unconditionally to avoid breaking cap | ||
| 1132 | * invariants. | ||
| 1133 | */ | ||
| 1134 | if (rinfo->head->op & CEPH_MDS_OP_WRITE) { | ||
| 1135 | pr_info("fill_trace faking empty trace on %lld %s\n", | ||
| 1136 | req->r_tid, ceph_mds_op_name(rinfo->head->op)); | ||
| 1137 | if (rinfo->head->is_dentry) { | ||
| 1138 | rinfo->head->is_dentry = 0; | ||
| 1139 | err = fill_inode(req->r_locked_dir, | ||
| 1140 | &rinfo->diri, rinfo->dirfrag, | ||
| 1141 | session, req->r_request_started, -1); | ||
| 1142 | } | ||
| 1143 | if (rinfo->head->is_target) { | ||
| 1144 | rinfo->head->is_target = 0; | ||
| 1145 | ininfo = rinfo->targeti.in; | ||
| 1146 | vino.ino = le64_to_cpu(ininfo->ino); | ||
| 1147 | vino.snap = le64_to_cpu(ininfo->snapid); | ||
| 1148 | in = ceph_get_inode(sb, vino); | ||
| 1149 | err = fill_inode(in, &rinfo->targeti, NULL, | ||
| 1150 | session, req->r_request_started, | ||
| 1151 | req->r_fmode); | ||
| 1152 | iput(in); | ||
| 1153 | } | ||
| 1154 | } | ||
| 1155 | #endif | ||
| 1156 | |||
| 1157 | if (!rinfo->head->is_target && !rinfo->head->is_dentry) { | 1130 | if (!rinfo->head->is_target && !rinfo->head->is_dentry) { |
| 1158 | dout("fill_trace reply is empty!\n"); | 1131 | dout("fill_trace reply is empty!\n"); |
| 1159 | if (rinfo->head->result == 0 && req->r_locked_dir) | 1132 | if (rinfo->head->result == 0 && req->r_parent) |
| 1160 | ceph_invalidate_dir_request(req); | 1133 | ceph_invalidate_dir_request(req); |
| 1161 | return 0; | 1134 | return 0; |
| 1162 | } | 1135 | } |
| 1163 | 1136 | ||
| 1164 | if (rinfo->head->is_dentry) { | 1137 | if (rinfo->head->is_dentry) { |
| 1165 | struct inode *dir = req->r_locked_dir; | 1138 | struct inode *dir = req->r_parent; |
| 1166 | 1139 | ||
| 1167 | if (dir) { | 1140 | if (dir) { |
| 1168 | err = fill_inode(dir, NULL, | 1141 | err = fill_inode(dir, NULL, |
| @@ -1188,8 +1161,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, | |||
| 1188 | dname.name = rinfo->dname; | 1161 | dname.name = rinfo->dname; |
| 1189 | dname.len = rinfo->dname_len; | 1162 | dname.len = rinfo->dname_len; |
| 1190 | dname.hash = full_name_hash(parent, dname.name, dname.len); | 1163 | dname.hash = full_name_hash(parent, dname.name, dname.len); |
| 1191 | vino.ino = le64_to_cpu(rinfo->targeti.in->ino); | 1164 | tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); |
| 1192 | vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); | 1165 | tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); |
| 1193 | retry_lookup: | 1166 | retry_lookup: |
| 1194 | dn = d_lookup(parent, &dname); | 1167 | dn = d_lookup(parent, &dname); |
| 1195 | dout("d_lookup on parent=%p name=%.*s got %p\n", | 1168 | dout("d_lookup on parent=%p name=%.*s got %p\n", |
| @@ -1206,8 +1179,8 @@ retry_lookup: | |||
| 1206 | } | 1179 | } |
| 1207 | err = 0; | 1180 | err = 0; |
| 1208 | } else if (d_really_is_positive(dn) && | 1181 | } else if (d_really_is_positive(dn) && |
| 1209 | (ceph_ino(d_inode(dn)) != vino.ino || | 1182 | (ceph_ino(d_inode(dn)) != tvino.ino || |
| 1210 | ceph_snap(d_inode(dn)) != vino.snap)) { | 1183 | ceph_snap(d_inode(dn)) != tvino.snap)) { |
| 1211 | dout(" dn %p points to wrong inode %p\n", | 1184 | dout(" dn %p points to wrong inode %p\n", |
| 1212 | dn, d_inode(dn)); | 1185 | dn, d_inode(dn)); |
| 1213 | d_delete(dn); | 1186 | d_delete(dn); |
| @@ -1221,10 +1194,10 @@ retry_lookup: | |||
| 1221 | } | 1194 | } |
| 1222 | 1195 | ||
| 1223 | if (rinfo->head->is_target) { | 1196 | if (rinfo->head->is_target) { |
| 1224 | vino.ino = le64_to_cpu(rinfo->targeti.in->ino); | 1197 | tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); |
| 1225 | vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); | 1198 | tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); |
| 1226 | 1199 | ||
| 1227 | in = ceph_get_inode(sb, vino); | 1200 | in = ceph_get_inode(sb, tvino); |
| 1228 | if (IS_ERR(in)) { | 1201 | if (IS_ERR(in)) { |
| 1229 | err = PTR_ERR(in); | 1202 | err = PTR_ERR(in); |
| 1230 | goto done; | 1203 | goto done; |
| @@ -1233,8 +1206,8 @@ retry_lookup: | |||
| 1233 | 1206 | ||
| 1234 | err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, | 1207 | err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, |
| 1235 | session, req->r_request_started, | 1208 | session, req->r_request_started, |
| 1236 | (!req->r_aborted && rinfo->head->result == 0) ? | 1209 | (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && |
| 1237 | req->r_fmode : -1, | 1210 | rinfo->head->result == 0) ? req->r_fmode : -1, |
| 1238 | &req->r_caps_reservation); | 1211 | &req->r_caps_reservation); |
| 1239 | if (err < 0) { | 1212 | if (err < 0) { |
| 1240 | pr_err("fill_inode badness %p %llx.%llx\n", | 1213 | pr_err("fill_inode badness %p %llx.%llx\n", |
| @@ -1247,8 +1220,9 @@ retry_lookup: | |||
| 1247 | * ignore null lease/binding on snapdir ENOENT, or else we | 1220 | * ignore null lease/binding on snapdir ENOENT, or else we |
| 1248 | * will have trouble splicing in the virtual snapdir later | 1221 | * will have trouble splicing in the virtual snapdir later |
| 1249 | */ | 1222 | */ |
| 1250 | if (rinfo->head->is_dentry && !req->r_aborted && | 1223 | if (rinfo->head->is_dentry && |
| 1251 | req->r_locked_dir && | 1224 | !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && |
| 1225 | test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && | ||
| 1252 | (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, | 1226 | (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, |
| 1253 | fsc->mount_options->snapdir_name, | 1227 | fsc->mount_options->snapdir_name, |
| 1254 | req->r_dentry->d_name.len))) { | 1228 | req->r_dentry->d_name.len))) { |
| @@ -1257,17 +1231,19 @@ retry_lookup: | |||
| 1257 | * mknod symlink mkdir : null -> new inode | 1231 | * mknod symlink mkdir : null -> new inode |
| 1258 | * unlink : linked -> null | 1232 | * unlink : linked -> null |
| 1259 | */ | 1233 | */ |
| 1260 | struct inode *dir = req->r_locked_dir; | 1234 | struct inode *dir = req->r_parent; |
| 1261 | struct dentry *dn = req->r_dentry; | 1235 | struct dentry *dn = req->r_dentry; |
| 1262 | bool have_dir_cap, have_lease; | 1236 | bool have_dir_cap, have_lease; |
| 1263 | 1237 | ||
| 1264 | BUG_ON(!dn); | 1238 | BUG_ON(!dn); |
| 1265 | BUG_ON(!dir); | 1239 | BUG_ON(!dir); |
| 1266 | BUG_ON(d_inode(dn->d_parent) != dir); | 1240 | BUG_ON(d_inode(dn->d_parent) != dir); |
| 1267 | BUG_ON(ceph_ino(dir) != | 1241 | |
| 1268 | le64_to_cpu(rinfo->diri.in->ino)); | 1242 | dvino.ino = le64_to_cpu(rinfo->diri.in->ino); |
| 1269 | BUG_ON(ceph_snap(dir) != | 1243 | dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); |
| 1270 | le64_to_cpu(rinfo->diri.in->snapid)); | 1244 | |
| 1245 | BUG_ON(ceph_ino(dir) != dvino.ino); | ||
| 1246 | BUG_ON(ceph_snap(dir) != dvino.snap); | ||
| 1271 | 1247 | ||
| 1272 | /* do we have a lease on the whole dir? */ | 1248 | /* do we have a lease on the whole dir? */ |
| 1273 | have_dir_cap = | 1249 | have_dir_cap = |
| @@ -1319,12 +1295,13 @@ retry_lookup: | |||
| 1319 | ceph_dir_clear_ordered(dir); | 1295 | ceph_dir_clear_ordered(dir); |
| 1320 | dout("d_delete %p\n", dn); | 1296 | dout("d_delete %p\n", dn); |
| 1321 | d_delete(dn); | 1297 | d_delete(dn); |
| 1322 | } else { | 1298 | } else if (have_lease) { |
| 1323 | if (have_lease && d_unhashed(dn)) | 1299 | if (d_unhashed(dn)) |
| 1324 | d_add(dn, NULL); | 1300 | d_add(dn, NULL); |
| 1325 | update_dentry_lease(dn, rinfo->dlease, | 1301 | update_dentry_lease(dn, rinfo->dlease, |
| 1326 | session, | 1302 | session, |
| 1327 | req->r_request_started); | 1303 | req->r_request_started, |
| 1304 | NULL, &dvino); | ||
| 1328 | } | 1305 | } |
| 1329 | goto done; | 1306 | goto done; |
| 1330 | } | 1307 | } |
| @@ -1347,15 +1324,19 @@ retry_lookup: | |||
| 1347 | have_lease = false; | 1324 | have_lease = false; |
| 1348 | } | 1325 | } |
| 1349 | 1326 | ||
| 1350 | if (have_lease) | 1327 | if (have_lease) { |
| 1328 | tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); | ||
| 1329 | tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); | ||
| 1351 | update_dentry_lease(dn, rinfo->dlease, session, | 1330 | update_dentry_lease(dn, rinfo->dlease, session, |
| 1352 | req->r_request_started); | 1331 | req->r_request_started, |
| 1332 | &tvino, &dvino); | ||
| 1333 | } | ||
| 1353 | dout(" final dn %p\n", dn); | 1334 | dout(" final dn %p\n", dn); |
| 1354 | } else if (!req->r_aborted && | 1335 | } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP || |
| 1355 | (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || | 1336 | req->r_op == CEPH_MDS_OP_MKSNAP) && |
| 1356 | req->r_op == CEPH_MDS_OP_MKSNAP)) { | 1337 | !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { |
| 1357 | struct dentry *dn = req->r_dentry; | 1338 | struct dentry *dn = req->r_dentry; |
| 1358 | struct inode *dir = req->r_locked_dir; | 1339 | struct inode *dir = req->r_parent; |
| 1359 | 1340 | ||
| 1360 | /* fill out a snapdir LOOKUPSNAP dentry */ | 1341 | /* fill out a snapdir LOOKUPSNAP dentry */ |
| 1361 | BUG_ON(!dn); | 1342 | BUG_ON(!dn); |
| @@ -1370,6 +1351,26 @@ retry_lookup: | |||
| 1370 | goto done; | 1351 | goto done; |
| 1371 | } | 1352 | } |
| 1372 | req->r_dentry = dn; /* may have spliced */ | 1353 | req->r_dentry = dn; /* may have spliced */ |
| 1354 | } else if (rinfo->head->is_dentry) { | ||
| 1355 | struct ceph_vino *ptvino = NULL; | ||
| 1356 | |||
| 1357 | if ((le32_to_cpu(rinfo->diri.in->cap.caps) & CEPH_CAP_FILE_SHARED) || | ||
| 1358 | le32_to_cpu(rinfo->dlease->duration_ms)) { | ||
| 1359 | dvino.ino = le64_to_cpu(rinfo->diri.in->ino); | ||
| 1360 | dvino.snap = le64_to_cpu(rinfo->diri.in->snapid); | ||
| 1361 | |||
| 1362 | if (rinfo->head->is_target) { | ||
| 1363 | tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); | ||
| 1364 | tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); | ||
| 1365 | ptvino = &tvino; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | update_dentry_lease(req->r_dentry, rinfo->dlease, | ||
| 1369 | session, req->r_request_started, ptvino, | ||
| 1370 | &dvino); | ||
| 1371 | } else { | ||
| 1372 | dout("%s: no dentry lease or dir cap\n", __func__); | ||
| 1373 | } | ||
| 1373 | } | 1374 | } |
| 1374 | done: | 1375 | done: |
| 1375 | dout("fill_trace done err=%d\n", err); | 1376 | dout("fill_trace done err=%d\n", err); |
| @@ -1478,7 +1479,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
| 1478 | u32 fpos_offset; | 1479 | u32 fpos_offset; |
| 1479 | struct ceph_readdir_cache_control cache_ctl = {}; | 1480 | struct ceph_readdir_cache_control cache_ctl = {}; |
| 1480 | 1481 | ||
| 1481 | if (req->r_aborted) | 1482 | if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) |
| 1482 | return readdir_prepopulate_inodes_only(req, session); | 1483 | return readdir_prepopulate_inodes_only(req, session); |
| 1483 | 1484 | ||
| 1484 | if (rinfo->hash_order && req->r_path2) { | 1485 | if (rinfo->hash_order && req->r_path2) { |
| @@ -1523,14 +1524,14 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
| 1523 | /* FIXME: release caps/leases if error occurs */ | 1524 | /* FIXME: release caps/leases if error occurs */ |
| 1524 | for (i = 0; i < rinfo->dir_nr; i++) { | 1525 | for (i = 0; i < rinfo->dir_nr; i++) { |
| 1525 | struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; | 1526 | struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; |
| 1526 | struct ceph_vino vino; | 1527 | struct ceph_vino tvino, dvino; |
| 1527 | 1528 | ||
| 1528 | dname.name = rde->name; | 1529 | dname.name = rde->name; |
| 1529 | dname.len = rde->name_len; | 1530 | dname.len = rde->name_len; |
| 1530 | dname.hash = full_name_hash(parent, dname.name, dname.len); | 1531 | dname.hash = full_name_hash(parent, dname.name, dname.len); |
| 1531 | 1532 | ||
| 1532 | vino.ino = le64_to_cpu(rde->inode.in->ino); | 1533 | tvino.ino = le64_to_cpu(rde->inode.in->ino); |
| 1533 | vino.snap = le64_to_cpu(rde->inode.in->snapid); | 1534 | tvino.snap = le64_to_cpu(rde->inode.in->snapid); |
| 1534 | 1535 | ||
| 1535 | if (rinfo->hash_order) { | 1536 | if (rinfo->hash_order) { |
| 1536 | u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, | 1537 | u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, |
| @@ -1559,8 +1560,8 @@ retry_lookup: | |||
| 1559 | goto out; | 1560 | goto out; |
| 1560 | } | 1561 | } |
| 1561 | } else if (d_really_is_positive(dn) && | 1562 | } else if (d_really_is_positive(dn) && |
| 1562 | (ceph_ino(d_inode(dn)) != vino.ino || | 1563 | (ceph_ino(d_inode(dn)) != tvino.ino || |
| 1563 | ceph_snap(d_inode(dn)) != vino.snap)) { | 1564 | ceph_snap(d_inode(dn)) != tvino.snap)) { |
| 1564 | dout(" dn %p points to wrong inode %p\n", | 1565 | dout(" dn %p points to wrong inode %p\n", |
| 1565 | dn, d_inode(dn)); | 1566 | dn, d_inode(dn)); |
| 1566 | d_delete(dn); | 1567 | d_delete(dn); |
| @@ -1572,7 +1573,7 @@ retry_lookup: | |||
| 1572 | if (d_really_is_positive(dn)) { | 1573 | if (d_really_is_positive(dn)) { |
| 1573 | in = d_inode(dn); | 1574 | in = d_inode(dn); |
| 1574 | } else { | 1575 | } else { |
| 1575 | in = ceph_get_inode(parent->d_sb, vino); | 1576 | in = ceph_get_inode(parent->d_sb, tvino); |
| 1576 | if (IS_ERR(in)) { | 1577 | if (IS_ERR(in)) { |
| 1577 | dout("new_inode badness\n"); | 1578 | dout("new_inode badness\n"); |
| 1578 | d_drop(dn); | 1579 | d_drop(dn); |
| @@ -1617,8 +1618,9 @@ retry_lookup: | |||
| 1617 | 1618 | ||
| 1618 | ceph_dentry(dn)->offset = rde->offset; | 1619 | ceph_dentry(dn)->offset = rde->offset; |
| 1619 | 1620 | ||
| 1621 | dvino = ceph_vino(d_inode(parent)); | ||
| 1620 | update_dentry_lease(dn, rde->lease, req->r_session, | 1622 | update_dentry_lease(dn, rde->lease, req->r_session, |
| 1621 | req->r_request_started); | 1623 | req->r_request_started, &tvino, &dvino); |
| 1622 | 1624 | ||
| 1623 | if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { | 1625 | if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { |
| 1624 | ret = fill_readdir_cache(d_inode(parent), dn, | 1626 | ret = fill_readdir_cache(d_inode(parent), dn, |
| @@ -1632,7 +1634,7 @@ next_item: | |||
| 1632 | } | 1634 | } |
| 1633 | out: | 1635 | out: |
| 1634 | if (err == 0 && skipped == 0) { | 1636 | if (err == 0 && skipped == 0) { |
| 1635 | req->r_did_prepopulate = true; | 1637 | set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags); |
| 1636 | req->r_readdir_cache_idx = cache_ctl.index; | 1638 | req->r_readdir_cache_idx = cache_ctl.index; |
| 1637 | } | 1639 | } |
| 1638 | ceph_readdir_cache_release(&cache_ctl); | 1640 | ceph_readdir_cache_release(&cache_ctl); |
| @@ -1720,7 +1722,7 @@ static void ceph_invalidate_work(struct work_struct *work) | |||
| 1720 | 1722 | ||
| 1721 | mutex_lock(&ci->i_truncate_mutex); | 1723 | mutex_lock(&ci->i_truncate_mutex); |
| 1722 | 1724 | ||
| 1723 | if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 1725 | if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
| 1724 | pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", | 1726 | pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", |
| 1725 | inode, ceph_ino(inode)); | 1727 | inode, ceph_ino(inode)); |
| 1726 | mapping_set_error(inode->i_mapping, -EIO); | 1728 | mapping_set_error(inode->i_mapping, -EIO); |
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 7d752d53353a..4c9c72f26eb9 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c | |||
| @@ -25,7 +25,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) | |||
| 25 | l.stripe_count = ci->i_layout.stripe_count; | 25 | l.stripe_count = ci->i_layout.stripe_count; |
| 26 | l.object_size = ci->i_layout.object_size; | 26 | l.object_size = ci->i_layout.object_size; |
| 27 | l.data_pool = ci->i_layout.pool_id; | 27 | l.data_pool = ci->i_layout.pool_id; |
| 28 | l.preferred_osd = (s32)-1; | 28 | l.preferred_osd = -1; |
| 29 | if (copy_to_user(arg, &l, sizeof(l))) | 29 | if (copy_to_user(arg, &l, sizeof(l))) |
| 30 | return -EFAULT; | 30 | return -EFAULT; |
| 31 | } | 31 | } |
| @@ -97,7 +97,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) | |||
| 97 | nl.data_pool = ci->i_layout.pool_id; | 97 | nl.data_pool = ci->i_layout.pool_id; |
| 98 | 98 | ||
| 99 | /* this is obsolete, and always -1 */ | 99 | /* this is obsolete, and always -1 */ |
| 100 | nl.preferred_osd = le64_to_cpu(-1); | 100 | nl.preferred_osd = -1; |
| 101 | 101 | ||
| 102 | err = __validate_layout(mdsc, &nl); | 102 | err = __validate_layout(mdsc, &nl); |
| 103 | if (err) | 103 | if (err) |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c9d2e553a6c4..c681762d76e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
| @@ -547,8 +547,8 @@ void ceph_mdsc_release_request(struct kref *kref) | |||
| 547 | ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); | 547 | ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); |
| 548 | iput(req->r_inode); | 548 | iput(req->r_inode); |
| 549 | } | 549 | } |
| 550 | if (req->r_locked_dir) | 550 | if (req->r_parent) |
| 551 | ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); | 551 | ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); |
| 552 | iput(req->r_target_inode); | 552 | iput(req->r_target_inode); |
| 553 | if (req->r_dentry) | 553 | if (req->r_dentry) |
| 554 | dput(req->r_dentry); | 554 | dput(req->r_dentry); |
| @@ -628,6 +628,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
| 628 | { | 628 | { |
| 629 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); | 629 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); |
| 630 | 630 | ||
| 631 | /* Never leave an unregistered request on an unsafe list! */ | ||
| 632 | list_del_init(&req->r_unsafe_item); | ||
| 633 | |||
| 631 | if (req->r_tid == mdsc->oldest_tid) { | 634 | if (req->r_tid == mdsc->oldest_tid) { |
| 632 | struct rb_node *p = rb_next(&req->r_node); | 635 | struct rb_node *p = rb_next(&req->r_node); |
| 633 | mdsc->oldest_tid = 0; | 636 | mdsc->oldest_tid = 0; |
| @@ -644,13 +647,15 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
| 644 | 647 | ||
| 645 | erase_request(&mdsc->request_tree, req); | 648 | erase_request(&mdsc->request_tree, req); |
| 646 | 649 | ||
| 647 | if (req->r_unsafe_dir && req->r_got_unsafe) { | 650 | if (req->r_unsafe_dir && |
| 651 | test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { | ||
| 648 | struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); | 652 | struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); |
| 649 | spin_lock(&ci->i_unsafe_lock); | 653 | spin_lock(&ci->i_unsafe_lock); |
| 650 | list_del_init(&req->r_unsafe_dir_item); | 654 | list_del_init(&req->r_unsafe_dir_item); |
| 651 | spin_unlock(&ci->i_unsafe_lock); | 655 | spin_unlock(&ci->i_unsafe_lock); |
| 652 | } | 656 | } |
| 653 | if (req->r_target_inode && req->r_got_unsafe) { | 657 | if (req->r_target_inode && |
| 658 | test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { | ||
| 654 | struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); | 659 | struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); |
| 655 | spin_lock(&ci->i_unsafe_lock); | 660 | spin_lock(&ci->i_unsafe_lock); |
| 656 | list_del_init(&req->r_unsafe_target_item); | 661 | list_del_init(&req->r_unsafe_target_item); |
| @@ -668,6 +673,28 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
| 668 | } | 673 | } |
| 669 | 674 | ||
| 670 | /* | 675 | /* |
| 676 | * Walk back up the dentry tree until we hit a dentry representing a | ||
| 677 | * non-snapshot inode. We do this using the rcu_read_lock (which must be held | ||
| 678 | * when calling this) to ensure that the objects won't disappear while we're | ||
| 679 | * working with them. Once we hit a candidate dentry, we attempt to take a | ||
| 680 | * reference to it, and return that as the result. | ||
| 681 | */ | ||
| 682 | static struct inode *get_nonsnap_parent(struct dentry *dentry) | ||
| 683 | { | ||
| 684 | struct inode *inode = NULL; | ||
| 685 | |||
| 686 | while (dentry && !IS_ROOT(dentry)) { | ||
| 687 | inode = d_inode_rcu(dentry); | ||
| 688 | if (!inode || ceph_snap(inode) == CEPH_NOSNAP) | ||
| 689 | break; | ||
| 690 | dentry = dentry->d_parent; | ||
| 691 | } | ||
| 692 | if (inode) | ||
| 693 | inode = igrab(inode); | ||
| 694 | return inode; | ||
| 695 | } | ||
| 696 | |||
| 697 | /* | ||
| 671 | * Choose mds to send request to next. If there is a hint set in the | 698 | * Choose mds to send request to next. If there is a hint set in the |
| 672 | * request (e.g., due to a prior forward hint from the mds), use that. | 699 | * request (e.g., due to a prior forward hint from the mds), use that. |
| 673 | * Otherwise, consult frag tree and/or caps to identify the | 700 | * Otherwise, consult frag tree and/or caps to identify the |
| @@ -675,19 +702,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
| 675 | * | 702 | * |
| 676 | * Called under mdsc->mutex. | 703 | * Called under mdsc->mutex. |
| 677 | */ | 704 | */ |
| 678 | static struct dentry *get_nonsnap_parent(struct dentry *dentry) | ||
| 679 | { | ||
| 680 | /* | ||
| 681 | * we don't need to worry about protecting the d_parent access | ||
| 682 | * here because we never renaming inside the snapped namespace | ||
| 683 | * except to resplice to another snapdir, and either the old or new | ||
| 684 | * result is a valid result. | ||
| 685 | */ | ||
| 686 | while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) | ||
| 687 | dentry = dentry->d_parent; | ||
| 688 | return dentry; | ||
| 689 | } | ||
| 690 | |||
| 691 | static int __choose_mds(struct ceph_mds_client *mdsc, | 705 | static int __choose_mds(struct ceph_mds_client *mdsc, |
| 692 | struct ceph_mds_request *req) | 706 | struct ceph_mds_request *req) |
| 693 | { | 707 | { |
| @@ -697,7 +711,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 697 | int mode = req->r_direct_mode; | 711 | int mode = req->r_direct_mode; |
| 698 | int mds = -1; | 712 | int mds = -1; |
| 699 | u32 hash = req->r_direct_hash; | 713 | u32 hash = req->r_direct_hash; |
| 700 | bool is_hash = req->r_direct_is_hash; | 714 | bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); |
| 701 | 715 | ||
| 702 | /* | 716 | /* |
| 703 | * is there a specific mds we should try? ignore hint if we have | 717 | * is there a specific mds we should try? ignore hint if we have |
| @@ -717,30 +731,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 717 | inode = NULL; | 731 | inode = NULL; |
| 718 | if (req->r_inode) { | 732 | if (req->r_inode) { |
| 719 | inode = req->r_inode; | 733 | inode = req->r_inode; |
| 734 | ihold(inode); | ||
| 720 | } else if (req->r_dentry) { | 735 | } else if (req->r_dentry) { |
| 721 | /* ignore race with rename; old or new d_parent is okay */ | 736 | /* ignore race with rename; old or new d_parent is okay */ |
| 722 | struct dentry *parent = req->r_dentry->d_parent; | 737 | struct dentry *parent; |
| 723 | struct inode *dir = d_inode(parent); | 738 | struct inode *dir; |
| 739 | |||
| 740 | rcu_read_lock(); | ||
| 741 | parent = req->r_dentry->d_parent; | ||
| 742 | dir = req->r_parent ? : d_inode_rcu(parent); | ||
| 724 | 743 | ||
| 725 | if (dir->i_sb != mdsc->fsc->sb) { | 744 | if (!dir || dir->i_sb != mdsc->fsc->sb) { |
| 726 | /* not this fs! */ | 745 | /* not this fs or parent went negative */ |
| 727 | inode = d_inode(req->r_dentry); | 746 | inode = d_inode(req->r_dentry); |
| 747 | if (inode) | ||
| 748 | ihold(inode); | ||
| 728 | } else if (ceph_snap(dir) != CEPH_NOSNAP) { | 749 | } else if (ceph_snap(dir) != CEPH_NOSNAP) { |
| 729 | /* direct snapped/virtual snapdir requests | 750 | /* direct snapped/virtual snapdir requests |
| 730 | * based on parent dir inode */ | 751 | * based on parent dir inode */ |
| 731 | struct dentry *dn = get_nonsnap_parent(parent); | 752 | inode = get_nonsnap_parent(parent); |
| 732 | inode = d_inode(dn); | ||
| 733 | dout("__choose_mds using nonsnap parent %p\n", inode); | 753 | dout("__choose_mds using nonsnap parent %p\n", inode); |
| 734 | } else { | 754 | } else { |
| 735 | /* dentry target */ | 755 | /* dentry target */ |
| 736 | inode = d_inode(req->r_dentry); | 756 | inode = d_inode(req->r_dentry); |
| 737 | if (!inode || mode == USE_AUTH_MDS) { | 757 | if (!inode || mode == USE_AUTH_MDS) { |
| 738 | /* dir + name */ | 758 | /* dir + name */ |
| 739 | inode = dir; | 759 | inode = igrab(dir); |
| 740 | hash = ceph_dentry_hash(dir, req->r_dentry); | 760 | hash = ceph_dentry_hash(dir, req->r_dentry); |
| 741 | is_hash = true; | 761 | is_hash = true; |
| 762 | } else { | ||
| 763 | ihold(inode); | ||
| 742 | } | 764 | } |
| 743 | } | 765 | } |
| 766 | rcu_read_unlock(); | ||
| 744 | } | 767 | } |
| 745 | 768 | ||
| 746 | dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, | 769 | dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, |
| @@ -769,7 +792,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 769 | (int)r, frag.ndist); | 792 | (int)r, frag.ndist); |
| 770 | if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= | 793 | if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= |
| 771 | CEPH_MDS_STATE_ACTIVE) | 794 | CEPH_MDS_STATE_ACTIVE) |
| 772 | return mds; | 795 | goto out; |
| 773 | } | 796 | } |
| 774 | 797 | ||
| 775 | /* since this file/dir wasn't known to be | 798 | /* since this file/dir wasn't known to be |
| @@ -784,7 +807,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 784 | inode, ceph_vinop(inode), frag.frag, mds); | 807 | inode, ceph_vinop(inode), frag.frag, mds); |
| 785 | if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= | 808 | if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= |
| 786 | CEPH_MDS_STATE_ACTIVE) | 809 | CEPH_MDS_STATE_ACTIVE) |
| 787 | return mds; | 810 | goto out; |
| 788 | } | 811 | } |
| 789 | } | 812 | } |
| 790 | } | 813 | } |
| @@ -797,6 +820,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 797 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); | 820 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); |
| 798 | if (!cap) { | 821 | if (!cap) { |
| 799 | spin_unlock(&ci->i_ceph_lock); | 822 | spin_unlock(&ci->i_ceph_lock); |
| 823 | iput(inode); | ||
| 800 | goto random; | 824 | goto random; |
| 801 | } | 825 | } |
| 802 | mds = cap->session->s_mds; | 826 | mds = cap->session->s_mds; |
| @@ -804,6 +828,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
| 804 | inode, ceph_vinop(inode), mds, | 828 | inode, ceph_vinop(inode), mds, |
| 805 | cap == ci->i_auth_cap ? "auth " : "", cap); | 829 | cap == ci->i_auth_cap ? "auth " : "", cap); |
| 806 | spin_unlock(&ci->i_ceph_lock); | 830 | spin_unlock(&ci->i_ceph_lock); |
| 831 | out: | ||
| 832 | iput(inode); | ||
| 807 | return mds; | 833 | return mds; |
| 808 | 834 | ||
| 809 | random: | 835 | random: |
| @@ -1036,7 +1062,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, | |||
| 1036 | while (!list_empty(&session->s_unsafe)) { | 1062 | while (!list_empty(&session->s_unsafe)) { |
| 1037 | req = list_first_entry(&session->s_unsafe, | 1063 | req = list_first_entry(&session->s_unsafe, |
| 1038 | struct ceph_mds_request, r_unsafe_item); | 1064 | struct ceph_mds_request, r_unsafe_item); |
| 1039 | list_del_init(&req->r_unsafe_item); | ||
| 1040 | pr_warn_ratelimited(" dropping unsafe request %llu\n", | 1065 | pr_warn_ratelimited(" dropping unsafe request %llu\n", |
| 1041 | req->r_tid); | 1066 | req->r_tid); |
| 1042 | __unregister_request(mdsc, req); | 1067 | __unregister_request(mdsc, req); |
| @@ -1146,7 +1171,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
| 1146 | ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; | 1171 | ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; |
| 1147 | 1172 | ||
| 1148 | if (ci->i_wrbuffer_ref > 0 && | 1173 | if (ci->i_wrbuffer_ref > 0 && |
| 1149 | ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) | 1174 | READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) |
| 1150 | invalidate = true; | 1175 | invalidate = true; |
| 1151 | 1176 | ||
| 1152 | while (!list_empty(&ci->i_cap_flush_list)) { | 1177 | while (!list_empty(&ci->i_cap_flush_list)) { |
| @@ -1775,18 +1800,23 @@ retry: | |||
| 1775 | return path; | 1800 | return path; |
| 1776 | } | 1801 | } |
| 1777 | 1802 | ||
| 1778 | static int build_dentry_path(struct dentry *dentry, | 1803 | static int build_dentry_path(struct dentry *dentry, struct inode *dir, |
| 1779 | const char **ppath, int *ppathlen, u64 *pino, | 1804 | const char **ppath, int *ppathlen, u64 *pino, |
| 1780 | int *pfreepath) | 1805 | int *pfreepath) |
| 1781 | { | 1806 | { |
| 1782 | char *path; | 1807 | char *path; |
| 1783 | 1808 | ||
| 1784 | if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { | 1809 | rcu_read_lock(); |
| 1785 | *pino = ceph_ino(d_inode(dentry->d_parent)); | 1810 | if (!dir) |
| 1811 | dir = d_inode_rcu(dentry->d_parent); | ||
| 1812 | if (dir && ceph_snap(dir) == CEPH_NOSNAP) { | ||
| 1813 | *pino = ceph_ino(dir); | ||
| 1814 | rcu_read_unlock(); | ||
| 1786 | *ppath = dentry->d_name.name; | 1815 | *ppath = dentry->d_name.name; |
| 1787 | *ppathlen = dentry->d_name.len; | 1816 | *ppathlen = dentry->d_name.len; |
| 1788 | return 0; | 1817 | return 0; |
| 1789 | } | 1818 | } |
| 1819 | rcu_read_unlock(); | ||
| 1790 | path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); | 1820 | path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1); |
| 1791 | if (IS_ERR(path)) | 1821 | if (IS_ERR(path)) |
| 1792 | return PTR_ERR(path); | 1822 | return PTR_ERR(path); |
| @@ -1822,8 +1852,8 @@ static int build_inode_path(struct inode *inode, | |||
| 1822 | * an explicit ino+path. | 1852 | * an explicit ino+path. |
| 1823 | */ | 1853 | */ |
| 1824 | static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, | 1854 | static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, |
| 1825 | const char *rpath, u64 rino, | 1855 | struct inode *rdiri, const char *rpath, |
| 1826 | const char **ppath, int *pathlen, | 1856 | u64 rino, const char **ppath, int *pathlen, |
| 1827 | u64 *ino, int *freepath) | 1857 | u64 *ino, int *freepath) |
| 1828 | { | 1858 | { |
| 1829 | int r = 0; | 1859 | int r = 0; |
| @@ -1833,7 +1863,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, | |||
| 1833 | dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), | 1863 | dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode), |
| 1834 | ceph_snap(rinode)); | 1864 | ceph_snap(rinode)); |
| 1835 | } else if (rdentry) { | 1865 | } else if (rdentry) { |
| 1836 | r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath); | 1866 | r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino, |
| 1867 | freepath); | ||
| 1837 | dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, | 1868 | dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen, |
| 1838 | *ppath); | 1869 | *ppath); |
| 1839 | } else if (rpath || rino) { | 1870 | } else if (rpath || rino) { |
| @@ -1866,7 +1897,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, | |||
| 1866 | int ret; | 1897 | int ret; |
| 1867 | 1898 | ||
| 1868 | ret = set_request_path_attr(req->r_inode, req->r_dentry, | 1899 | ret = set_request_path_attr(req->r_inode, req->r_dentry, |
| 1869 | req->r_path1, req->r_ino1.ino, | 1900 | req->r_parent, req->r_path1, req->r_ino1.ino, |
| 1870 | &path1, &pathlen1, &ino1, &freepath1); | 1901 | &path1, &pathlen1, &ino1, &freepath1); |
| 1871 | if (ret < 0) { | 1902 | if (ret < 0) { |
| 1872 | msg = ERR_PTR(ret); | 1903 | msg = ERR_PTR(ret); |
| @@ -1874,6 +1905,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, | |||
| 1874 | } | 1905 | } |
| 1875 | 1906 | ||
| 1876 | ret = set_request_path_attr(NULL, req->r_old_dentry, | 1907 | ret = set_request_path_attr(NULL, req->r_old_dentry, |
| 1908 | req->r_old_dentry_dir, | ||
| 1877 | req->r_path2, req->r_ino2.ino, | 1909 | req->r_path2, req->r_ino2.ino, |
| 1878 | &path2, &pathlen2, &ino2, &freepath2); | 1910 | &path2, &pathlen2, &ino2, &freepath2); |
| 1879 | if (ret < 0) { | 1911 | if (ret < 0) { |
| @@ -1927,10 +1959,13 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, | |||
| 1927 | mds, req->r_inode_drop, req->r_inode_unless, 0); | 1959 | mds, req->r_inode_drop, req->r_inode_unless, 0); |
| 1928 | if (req->r_dentry_drop) | 1960 | if (req->r_dentry_drop) |
| 1929 | releases += ceph_encode_dentry_release(&p, req->r_dentry, | 1961 | releases += ceph_encode_dentry_release(&p, req->r_dentry, |
| 1930 | mds, req->r_dentry_drop, req->r_dentry_unless); | 1962 | req->r_parent, mds, req->r_dentry_drop, |
| 1963 | req->r_dentry_unless); | ||
| 1931 | if (req->r_old_dentry_drop) | 1964 | if (req->r_old_dentry_drop) |
| 1932 | releases += ceph_encode_dentry_release(&p, req->r_old_dentry, | 1965 | releases += ceph_encode_dentry_release(&p, req->r_old_dentry, |
| 1933 | mds, req->r_old_dentry_drop, req->r_old_dentry_unless); | 1966 | req->r_old_dentry_dir, mds, |
| 1967 | req->r_old_dentry_drop, | ||
| 1968 | req->r_old_dentry_unless); | ||
| 1934 | if (req->r_old_inode_drop) | 1969 | if (req->r_old_inode_drop) |
| 1935 | releases += ceph_encode_inode_release(&p, | 1970 | releases += ceph_encode_inode_release(&p, |
| 1936 | d_inode(req->r_old_dentry), | 1971 | d_inode(req->r_old_dentry), |
| @@ -2012,7 +2047,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, | |||
| 2012 | dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, | 2047 | dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, |
| 2013 | req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); | 2048 | req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); |
| 2014 | 2049 | ||
| 2015 | if (req->r_got_unsafe) { | 2050 | if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { |
| 2016 | void *p; | 2051 | void *p; |
| 2017 | /* | 2052 | /* |
| 2018 | * Replay. Do not regenerate message (and rebuild | 2053 | * Replay. Do not regenerate message (and rebuild |
| @@ -2061,16 +2096,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, | |||
| 2061 | 2096 | ||
| 2062 | rhead = msg->front.iov_base; | 2097 | rhead = msg->front.iov_base; |
| 2063 | rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); | 2098 | rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); |
| 2064 | if (req->r_got_unsafe) | 2099 | if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) |
| 2065 | flags |= CEPH_MDS_FLAG_REPLAY; | 2100 | flags |= CEPH_MDS_FLAG_REPLAY; |
| 2066 | if (req->r_locked_dir) | 2101 | if (req->r_parent) |
| 2067 | flags |= CEPH_MDS_FLAG_WANT_DENTRY; | 2102 | flags |= CEPH_MDS_FLAG_WANT_DENTRY; |
| 2068 | rhead->flags = cpu_to_le32(flags); | 2103 | rhead->flags = cpu_to_le32(flags); |
| 2069 | rhead->num_fwd = req->r_num_fwd; | 2104 | rhead->num_fwd = req->r_num_fwd; |
| 2070 | rhead->num_retry = req->r_attempts - 1; | 2105 | rhead->num_retry = req->r_attempts - 1; |
| 2071 | rhead->ino = 0; | 2106 | rhead->ino = 0; |
| 2072 | 2107 | ||
| 2073 | dout(" r_locked_dir = %p\n", req->r_locked_dir); | 2108 | dout(" r_parent = %p\n", req->r_parent); |
| 2074 | return 0; | 2109 | return 0; |
| 2075 | } | 2110 | } |
| 2076 | 2111 | ||
| @@ -2084,8 +2119,8 @@ static int __do_request(struct ceph_mds_client *mdsc, | |||
| 2084 | int mds = -1; | 2119 | int mds = -1; |
| 2085 | int err = 0; | 2120 | int err = 0; |
| 2086 | 2121 | ||
| 2087 | if (req->r_err || req->r_got_result) { | 2122 | if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { |
| 2088 | if (req->r_aborted) | 2123 | if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) |
| 2089 | __unregister_request(mdsc, req); | 2124 | __unregister_request(mdsc, req); |
| 2090 | goto out; | 2125 | goto out; |
| 2091 | } | 2126 | } |
| @@ -2096,12 +2131,12 @@ static int __do_request(struct ceph_mds_client *mdsc, | |||
| 2096 | err = -EIO; | 2131 | err = -EIO; |
| 2097 | goto finish; | 2132 | goto finish; |
| 2098 | } | 2133 | } |
| 2099 | if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { | 2134 | if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
| 2100 | dout("do_request forced umount\n"); | 2135 | dout("do_request forced umount\n"); |
| 2101 | err = -EIO; | 2136 | err = -EIO; |
| 2102 | goto finish; | 2137 | goto finish; |
| 2103 | } | 2138 | } |
| 2104 | if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { | 2139 | if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { |
| 2105 | if (mdsc->mdsmap_err) { | 2140 | if (mdsc->mdsmap_err) { |
| 2106 | err = mdsc->mdsmap_err; | 2141 | err = mdsc->mdsmap_err; |
| 2107 | dout("do_request mdsmap err %d\n", err); | 2142 | dout("do_request mdsmap err %d\n", err); |
| @@ -2215,7 +2250,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) | |||
| 2215 | while (p) { | 2250 | while (p) { |
| 2216 | req = rb_entry(p, struct ceph_mds_request, r_node); | 2251 | req = rb_entry(p, struct ceph_mds_request, r_node); |
| 2217 | p = rb_next(p); | 2252 | p = rb_next(p); |
| 2218 | if (req->r_got_unsafe) | 2253 | if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) |
| 2219 | continue; | 2254 | continue; |
| 2220 | if (req->r_attempts > 0) | 2255 | if (req->r_attempts > 0) |
| 2221 | continue; /* only new requests */ | 2256 | continue; /* only new requests */ |
| @@ -2250,11 +2285,11 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, | |||
| 2250 | 2285 | ||
| 2251 | dout("do_request on %p\n", req); | 2286 | dout("do_request on %p\n", req); |
| 2252 | 2287 | ||
| 2253 | /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */ | 2288 | /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ |
| 2254 | if (req->r_inode) | 2289 | if (req->r_inode) |
| 2255 | ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); | 2290 | ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); |
| 2256 | if (req->r_locked_dir) | 2291 | if (req->r_parent) |
| 2257 | ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); | 2292 | ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); |
| 2258 | if (req->r_old_dentry_dir) | 2293 | if (req->r_old_dentry_dir) |
| 2259 | ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), | 2294 | ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), |
| 2260 | CEPH_CAP_PIN); | 2295 | CEPH_CAP_PIN); |
| @@ -2289,7 +2324,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, | |||
| 2289 | mutex_lock(&mdsc->mutex); | 2324 | mutex_lock(&mdsc->mutex); |
| 2290 | 2325 | ||
| 2291 | /* only abort if we didn't race with a real reply */ | 2326 | /* only abort if we didn't race with a real reply */ |
| 2292 | if (req->r_got_result) { | 2327 | if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { |
| 2293 | err = le32_to_cpu(req->r_reply_info.head->result); | 2328 | err = le32_to_cpu(req->r_reply_info.head->result); |
| 2294 | } else if (err < 0) { | 2329 | } else if (err < 0) { |
| 2295 | dout("aborted request %lld with %d\n", req->r_tid, err); | 2330 | dout("aborted request %lld with %d\n", req->r_tid, err); |
| @@ -2301,10 +2336,10 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, | |||
| 2301 | */ | 2336 | */ |
| 2302 | mutex_lock(&req->r_fill_mutex); | 2337 | mutex_lock(&req->r_fill_mutex); |
| 2303 | req->r_err = err; | 2338 | req->r_err = err; |
| 2304 | req->r_aborted = true; | 2339 | set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); |
| 2305 | mutex_unlock(&req->r_fill_mutex); | 2340 | mutex_unlock(&req->r_fill_mutex); |
| 2306 | 2341 | ||
| 2307 | if (req->r_locked_dir && | 2342 | if (req->r_parent && |
| 2308 | (req->r_op & CEPH_MDS_OP_WRITE)) | 2343 | (req->r_op & CEPH_MDS_OP_WRITE)) |
| 2309 | ceph_invalidate_dir_request(req); | 2344 | ceph_invalidate_dir_request(req); |
| 2310 | } else { | 2345 | } else { |
| @@ -2323,7 +2358,7 @@ out: | |||
| 2323 | */ | 2358 | */ |
| 2324 | void ceph_invalidate_dir_request(struct ceph_mds_request *req) | 2359 | void ceph_invalidate_dir_request(struct ceph_mds_request *req) |
| 2325 | { | 2360 | { |
| 2326 | struct inode *inode = req->r_locked_dir; | 2361 | struct inode *inode = req->r_parent; |
| 2327 | 2362 | ||
| 2328 | dout("invalidate_dir_request %p (complete, lease(s))\n", inode); | 2363 | dout("invalidate_dir_request %p (complete, lease(s))\n", inode); |
| 2329 | 2364 | ||
| @@ -2379,14 +2414,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2379 | } | 2414 | } |
| 2380 | 2415 | ||
| 2381 | /* dup? */ | 2416 | /* dup? */ |
| 2382 | if ((req->r_got_unsafe && !head->safe) || | 2417 | if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) || |
| 2383 | (req->r_got_safe && head->safe)) { | 2418 | (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) { |
| 2384 | pr_warn("got a dup %s reply on %llu from mds%d\n", | 2419 | pr_warn("got a dup %s reply on %llu from mds%d\n", |
| 2385 | head->safe ? "safe" : "unsafe", tid, mds); | 2420 | head->safe ? "safe" : "unsafe", tid, mds); |
| 2386 | mutex_unlock(&mdsc->mutex); | 2421 | mutex_unlock(&mdsc->mutex); |
| 2387 | goto out; | 2422 | goto out; |
| 2388 | } | 2423 | } |
| 2389 | if (req->r_got_safe) { | 2424 | if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) { |
| 2390 | pr_warn("got unsafe after safe on %llu from mds%d\n", | 2425 | pr_warn("got unsafe after safe on %llu from mds%d\n", |
| 2391 | tid, mds); | 2426 | tid, mds); |
| 2392 | mutex_unlock(&mdsc->mutex); | 2427 | mutex_unlock(&mdsc->mutex); |
| @@ -2425,10 +2460,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2425 | 2460 | ||
| 2426 | 2461 | ||
| 2427 | if (head->safe) { | 2462 | if (head->safe) { |
| 2428 | req->r_got_safe = true; | 2463 | set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); |
| 2429 | __unregister_request(mdsc, req); | 2464 | __unregister_request(mdsc, req); |
| 2430 | 2465 | ||
| 2431 | if (req->r_got_unsafe) { | 2466 | if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { |
| 2432 | /* | 2467 | /* |
| 2433 | * We already handled the unsafe response, now do the | 2468 | * We already handled the unsafe response, now do the |
| 2434 | * cleanup. No need to examine the response; the MDS | 2469 | * cleanup. No need to examine the response; the MDS |
| @@ -2437,7 +2472,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2437 | * useful we could do with a revised return value. | 2472 | * useful we could do with a revised return value. |
| 2438 | */ | 2473 | */ |
| 2439 | dout("got safe reply %llu, mds%d\n", tid, mds); | 2474 | dout("got safe reply %llu, mds%d\n", tid, mds); |
| 2440 | list_del_init(&req->r_unsafe_item); | ||
| 2441 | 2475 | ||
| 2442 | /* last unsafe request during umount? */ | 2476 | /* last unsafe request during umount? */ |
| 2443 | if (mdsc->stopping && !__get_oldest_req(mdsc)) | 2477 | if (mdsc->stopping && !__get_oldest_req(mdsc)) |
| @@ -2446,7 +2480,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2446 | goto out; | 2480 | goto out; |
| 2447 | } | 2481 | } |
| 2448 | } else { | 2482 | } else { |
| 2449 | req->r_got_unsafe = true; | 2483 | set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); |
| 2450 | list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); | 2484 | list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); |
| 2451 | if (req->r_unsafe_dir) { | 2485 | if (req->r_unsafe_dir) { |
| 2452 | struct ceph_inode_info *ci = | 2486 | struct ceph_inode_info *ci = |
| @@ -2486,7 +2520,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2486 | /* insert trace into our cache */ | 2520 | /* insert trace into our cache */ |
| 2487 | mutex_lock(&req->r_fill_mutex); | 2521 | mutex_lock(&req->r_fill_mutex); |
| 2488 | current->journal_info = req; | 2522 | current->journal_info = req; |
| 2489 | err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); | 2523 | err = ceph_fill_trace(mdsc->fsc->sb, req); |
| 2490 | if (err == 0) { | 2524 | if (err == 0) { |
| 2491 | if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || | 2525 | if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || |
| 2492 | req->r_op == CEPH_MDS_OP_LSSNAP)) | 2526 | req->r_op == CEPH_MDS_OP_LSSNAP)) |
| @@ -2500,7 +2534,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2500 | if (realm) | 2534 | if (realm) |
| 2501 | ceph_put_snap_realm(mdsc, realm); | 2535 | ceph_put_snap_realm(mdsc, realm); |
| 2502 | 2536 | ||
| 2503 | if (err == 0 && req->r_got_unsafe && req->r_target_inode) { | 2537 | if (err == 0 && req->r_target_inode && |
| 2538 | test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { | ||
| 2504 | struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); | 2539 | struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); |
| 2505 | spin_lock(&ci->i_unsafe_lock); | 2540 | spin_lock(&ci->i_unsafe_lock); |
| 2506 | list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); | 2541 | list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); |
| @@ -2508,12 +2543,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) | |||
| 2508 | } | 2543 | } |
| 2509 | out_err: | 2544 | out_err: |
| 2510 | mutex_lock(&mdsc->mutex); | 2545 | mutex_lock(&mdsc->mutex); |
| 2511 | if (!req->r_aborted) { | 2546 | if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { |
| 2512 | if (err) { | 2547 | if (err) { |
| 2513 | req->r_err = err; | 2548 | req->r_err = err; |
| 2514 | } else { | 2549 | } else { |
| 2515 | req->r_reply = ceph_msg_get(msg); | 2550 | req->r_reply = ceph_msg_get(msg); |
| 2516 | req->r_got_result = true; | 2551 | set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags); |
| 2517 | } | 2552 | } |
| 2518 | } else { | 2553 | } else { |
| 2519 | dout("reply arrived after request %lld was aborted\n", tid); | 2554 | dout("reply arrived after request %lld was aborted\n", tid); |
| @@ -2557,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, | |||
| 2557 | goto out; /* dup reply? */ | 2592 | goto out; /* dup reply? */ |
| 2558 | } | 2593 | } |
| 2559 | 2594 | ||
| 2560 | if (req->r_aborted) { | 2595 | if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { |
| 2561 | dout("forward tid %llu aborted, unregistering\n", tid); | 2596 | dout("forward tid %llu aborted, unregistering\n", tid); |
| 2562 | __unregister_request(mdsc, req); | 2597 | __unregister_request(mdsc, req); |
| 2563 | } else if (fwd_seq <= req->r_num_fwd) { | 2598 | } else if (fwd_seq <= req->r_num_fwd) { |
| @@ -2567,7 +2602,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, | |||
| 2567 | /* resend. forward race not possible; mds would drop */ | 2602 | /* resend. forward race not possible; mds would drop */ |
| 2568 | dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); | 2603 | dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); |
| 2569 | BUG_ON(req->r_err); | 2604 | BUG_ON(req->r_err); |
| 2570 | BUG_ON(req->r_got_result); | 2605 | BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)); |
| 2571 | req->r_attempts = 0; | 2606 | req->r_attempts = 0; |
| 2572 | req->r_num_fwd = fwd_seq; | 2607 | req->r_num_fwd = fwd_seq; |
| 2573 | req->r_resend_mds = next_mds; | 2608 | req->r_resend_mds = next_mds; |
| @@ -2732,7 +2767,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, | |||
| 2732 | while (p) { | 2767 | while (p) { |
| 2733 | req = rb_entry(p, struct ceph_mds_request, r_node); | 2768 | req = rb_entry(p, struct ceph_mds_request, r_node); |
| 2734 | p = rb_next(p); | 2769 | p = rb_next(p); |
| 2735 | if (req->r_got_unsafe) | 2770 | if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) |
| 2736 | continue; | 2771 | continue; |
| 2737 | if (req->r_attempts == 0) | 2772 | if (req->r_attempts == 0) |
| 2738 | continue; /* only old requests */ | 2773 | continue; /* only old requests */ |
| @@ -3556,7 +3591,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | |||
| 3556 | { | 3591 | { |
| 3557 | u64 want_tid, want_flush; | 3592 | u64 want_tid, want_flush; |
| 3558 | 3593 | ||
| 3559 | if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) | 3594 | if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) |
| 3560 | return; | 3595 | return; |
| 3561 | 3596 | ||
| 3562 | dout("sync\n"); | 3597 | dout("sync\n"); |
| @@ -3587,7 +3622,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | |||
| 3587 | */ | 3622 | */ |
| 3588 | static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) | 3623 | static bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped) |
| 3589 | { | 3624 | { |
| 3590 | if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) | 3625 | if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) |
| 3591 | return true; | 3626 | return true; |
| 3592 | return atomic_read(&mdsc->num_sessions) <= skipped; | 3627 | return atomic_read(&mdsc->num_sessions) <= skipped; |
| 3593 | } | 3628 | } |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3c6f77b7bb02..ac0475a2daa7 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
| @@ -202,9 +202,18 @@ struct ceph_mds_request { | |||
| 202 | char *r_path1, *r_path2; | 202 | char *r_path1, *r_path2; |
| 203 | struct ceph_vino r_ino1, r_ino2; | 203 | struct ceph_vino r_ino1, r_ino2; |
| 204 | 204 | ||
| 205 | struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ | 205 | struct inode *r_parent; /* parent dir inode */ |
| 206 | struct inode *r_target_inode; /* resulting inode */ | 206 | struct inode *r_target_inode; /* resulting inode */ |
| 207 | 207 | ||
| 208 | #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ | ||
| 209 | #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ | ||
| 210 | #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ | ||
| 211 | #define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ | ||
| 212 | #define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ | ||
| 213 | #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ | ||
| 214 | #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ | ||
| 215 | unsigned long r_req_flags; | ||
| 216 | |||
| 208 | struct mutex r_fill_mutex; | 217 | struct mutex r_fill_mutex; |
| 209 | 218 | ||
| 210 | union ceph_mds_request_args r_args; | 219 | union ceph_mds_request_args r_args; |
| @@ -216,7 +225,6 @@ struct ceph_mds_request { | |||
| 216 | /* for choosing which mds to send this request to */ | 225 | /* for choosing which mds to send this request to */ |
| 217 | int r_direct_mode; | 226 | int r_direct_mode; |
| 218 | u32 r_direct_hash; /* choose dir frag based on this dentry hash */ | 227 | u32 r_direct_hash; /* choose dir frag based on this dentry hash */ |
| 219 | bool r_direct_is_hash; /* true if r_direct_hash is valid */ | ||
| 220 | 228 | ||
| 221 | /* data payload is used for xattr ops */ | 229 | /* data payload is used for xattr ops */ |
| 222 | struct ceph_pagelist *r_pagelist; | 230 | struct ceph_pagelist *r_pagelist; |
| @@ -234,7 +242,6 @@ struct ceph_mds_request { | |||
| 234 | struct ceph_mds_reply_info_parsed r_reply_info; | 242 | struct ceph_mds_reply_info_parsed r_reply_info; |
| 235 | struct page *r_locked_page; | 243 | struct page *r_locked_page; |
| 236 | int r_err; | 244 | int r_err; |
| 237 | bool r_aborted; | ||
| 238 | 245 | ||
| 239 | unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ | 246 | unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ |
| 240 | unsigned long r_started; /* start time to measure timeout against */ | 247 | unsigned long r_started; /* start time to measure timeout against */ |
| @@ -262,9 +269,7 @@ struct ceph_mds_request { | |||
| 262 | ceph_mds_request_callback_t r_callback; | 269 | ceph_mds_request_callback_t r_callback; |
| 263 | ceph_mds_request_wait_callback_t r_wait_for_completion; | 270 | ceph_mds_request_wait_callback_t r_wait_for_completion; |
| 264 | struct list_head r_unsafe_item; /* per-session unsafe list item */ | 271 | struct list_head r_unsafe_item; /* per-session unsafe list item */ |
| 265 | bool r_got_unsafe, r_got_safe, r_got_result; | ||
| 266 | 272 | ||
| 267 | bool r_did_prepopulate; | ||
| 268 | long long r_dir_release_cnt; | 273 | long long r_dir_release_cnt; |
| 269 | long long r_dir_ordered_cnt; | 274 | long long r_dir_ordered_cnt; |
| 270 | int r_readdir_cache_idx; | 275 | int r_readdir_cache_idx; |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 6bd20d707bfd..0ec8d0114e57 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
| @@ -757,7 +757,6 @@ static const struct super_operations ceph_super_ops = { | |||
| 757 | .destroy_inode = ceph_destroy_inode, | 757 | .destroy_inode = ceph_destroy_inode, |
| 758 | .write_inode = ceph_write_inode, | 758 | .write_inode = ceph_write_inode, |
| 759 | .drop_inode = ceph_drop_inode, | 759 | .drop_inode = ceph_drop_inode, |
| 760 | .evict_inode = ceph_evict_inode, | ||
| 761 | .sync_fs = ceph_sync_fs, | 760 | .sync_fs = ceph_sync_fs, |
| 762 | .put_super = ceph_put_super, | 761 | .put_super = ceph_put_super, |
| 763 | .show_options = ceph_show_options, | 762 | .show_options = ceph_show_options, |
| @@ -952,6 +951,14 @@ static int ceph_register_bdi(struct super_block *sb, | |||
| 952 | fsc->backing_dev_info.ra_pages = | 951 | fsc->backing_dev_info.ra_pages = |
| 953 | VM_MAX_READAHEAD * 1024 / PAGE_SIZE; | 952 | VM_MAX_READAHEAD * 1024 / PAGE_SIZE; |
| 954 | 953 | ||
| 954 | if (fsc->mount_options->rsize > fsc->mount_options->rasize && | ||
| 955 | fsc->mount_options->rsize >= PAGE_SIZE) | ||
| 956 | fsc->backing_dev_info.io_pages = | ||
| 957 | (fsc->mount_options->rsize + PAGE_SIZE - 1) | ||
| 958 | >> PAGE_SHIFT; | ||
| 959 | else if (fsc->mount_options->rsize == 0) | ||
| 960 | fsc->backing_dev_info.io_pages = ULONG_MAX; | ||
| 961 | |||
| 955 | err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", | 962 | err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", |
| 956 | atomic_long_inc_return(&bdi_seq)); | 963 | atomic_long_inc_return(&bdi_seq)); |
| 957 | if (!err) | 964 | if (!err) |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3373b61faefd..e9410bcf4113 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
| @@ -45,8 +45,8 @@ | |||
| 45 | #define ceph_test_mount_opt(fsc, opt) \ | 45 | #define ceph_test_mount_opt(fsc, opt) \ |
| 46 | (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) | 46 | (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) |
| 47 | 47 | ||
| 48 | #define CEPH_RSIZE_DEFAULT 0 /* max read size */ | 48 | #define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ |
| 49 | #define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ | 49 | #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ |
| 50 | #define CEPH_MAX_READDIR_DEFAULT 1024 | 50 | #define CEPH_MAX_READDIR_DEFAULT 1024 |
| 51 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) | 51 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) |
| 52 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" | 52 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" |
| @@ -343,7 +343,6 @@ struct ceph_inode_info { | |||
| 343 | u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ | 343 | u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ |
| 344 | u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ | 344 | u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ |
| 345 | 345 | ||
| 346 | struct list_head i_unsafe_writes; /* uncommitted sync writes */ | ||
| 347 | struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ | 346 | struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */ |
| 348 | struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ | 347 | struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ |
| 349 | spinlock_t i_unsafe_lock; | 348 | spinlock_t i_unsafe_lock; |
| @@ -602,7 +601,7 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) | |||
| 602 | } | 601 | } |
| 603 | 602 | ||
| 604 | /* what the mds thinks we want */ | 603 | /* what the mds thinks we want */ |
| 605 | extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); | 604 | extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); |
| 606 | 605 | ||
| 607 | extern void ceph_caps_init(struct ceph_mds_client *mdsc); | 606 | extern void ceph_caps_init(struct ceph_mds_client *mdsc); |
| 608 | extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); | 607 | extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); |
| @@ -753,7 +752,6 @@ extern const struct inode_operations ceph_file_iops; | |||
| 753 | extern struct inode *ceph_alloc_inode(struct super_block *sb); | 752 | extern struct inode *ceph_alloc_inode(struct super_block *sb); |
| 754 | extern void ceph_destroy_inode(struct inode *inode); | 753 | extern void ceph_destroy_inode(struct inode *inode); |
| 755 | extern int ceph_drop_inode(struct inode *inode); | 754 | extern int ceph_drop_inode(struct inode *inode); |
| 756 | extern void ceph_evict_inode(struct inode *inode); | ||
| 757 | 755 | ||
| 758 | extern struct inode *ceph_get_inode(struct super_block *sb, | 756 | extern struct inode *ceph_get_inode(struct super_block *sb, |
| 759 | struct ceph_vino vino); | 757 | struct ceph_vino vino); |
| @@ -764,8 +762,7 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, | |||
| 764 | u64 time_warp_seq, struct timespec *ctime, | 762 | u64 time_warp_seq, struct timespec *ctime, |
| 765 | struct timespec *mtime, struct timespec *atime); | 763 | struct timespec *mtime, struct timespec *atime); |
| 766 | extern int ceph_fill_trace(struct super_block *sb, | 764 | extern int ceph_fill_trace(struct super_block *sb, |
| 767 | struct ceph_mds_request *req, | 765 | struct ceph_mds_request *req); |
| 768 | struct ceph_mds_session *session); | ||
| 769 | extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, | 766 | extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, |
| 770 | struct ceph_mds_session *session); | 767 | struct ceph_mds_session *session); |
| 771 | 768 | ||
| @@ -904,6 +901,7 @@ extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); | |||
| 904 | extern int ceph_encode_inode_release(void **p, struct inode *inode, | 901 | extern int ceph_encode_inode_release(void **p, struct inode *inode, |
| 905 | int mds, int drop, int unless, int force); | 902 | int mds, int drop, int unless, int force); |
| 906 | extern int ceph_encode_dentry_release(void **p, struct dentry *dn, | 903 | extern int ceph_encode_dentry_release(void **p, struct dentry *dn, |
| 904 | struct inode *dir, | ||
| 907 | int mds, int drop, int unless); | 905 | int mds, int drop, int unless); |
| 908 | 906 | ||
| 909 | extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | 907 | extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, |
| @@ -933,7 +931,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | |||
| 933 | extern int ceph_release(struct inode *inode, struct file *filp); | 931 | extern int ceph_release(struct inode *inode, struct file *filp); |
| 934 | extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, | 932 | extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, |
| 935 | char *data, size_t len); | 933 | char *data, size_t len); |
| 936 | extern void ceph_sync_write_wait(struct inode *inode); | 934 | |
| 937 | /* dir.c */ | 935 | /* dir.c */ |
| 938 | extern const struct file_operations ceph_dir_fops; | 936 | extern const struct file_operations ceph_dir_fops; |
| 939 | extern const struct file_operations ceph_snapdir_fops; | 937 | extern const struct file_operations ceph_snapdir_fops; |
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 03a6653d329a..2ea0c282f3dc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
| @@ -22,7 +22,6 @@ struct ceph_osd_client; | |||
| 22 | * completion callback for async writepages | 22 | * completion callback for async writepages |
| 23 | */ | 23 | */ |
| 24 | typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); | 24 | typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); |
| 25 | typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); | ||
| 26 | 25 | ||
| 27 | #define CEPH_HOMELESS_OSD -1 | 26 | #define CEPH_HOMELESS_OSD -1 |
| 28 | 27 | ||
| @@ -170,15 +169,12 @@ struct ceph_osd_request { | |||
| 170 | unsigned int r_num_ops; | 169 | unsigned int r_num_ops; |
| 171 | 170 | ||
| 172 | int r_result; | 171 | int r_result; |
| 173 | bool r_got_reply; | ||
| 174 | 172 | ||
| 175 | struct ceph_osd_client *r_osdc; | 173 | struct ceph_osd_client *r_osdc; |
| 176 | struct kref r_kref; | 174 | struct kref r_kref; |
| 177 | bool r_mempool; | 175 | bool r_mempool; |
| 178 | struct completion r_completion; | 176 | struct completion r_completion; /* private to osd_client.c */ |
| 179 | struct completion r_done_completion; /* fsync waiter */ | ||
| 180 | ceph_osdc_callback_t r_callback; | 177 | ceph_osdc_callback_t r_callback; |
| 181 | ceph_osdc_unsafe_callback_t r_unsafe_callback; | ||
| 182 | struct list_head r_unsafe_item; | 178 | struct list_head r_unsafe_item; |
| 183 | 179 | ||
| 184 | struct inode *r_inode; /* for use by callbacks */ | 180 | struct inode *r_inode; /* for use by callbacks */ |
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 9a9041784dcf..938656f70807 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h | |||
| @@ -57,7 +57,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool) | |||
| 57 | case CEPH_POOL_TYPE_EC: | 57 | case CEPH_POOL_TYPE_EC: |
| 58 | return false; | 58 | return false; |
| 59 | default: | 59 | default: |
| 60 | BUG_ON(1); | 60 | BUG(); |
| 61 | } | 61 | } |
| 62 | } | 62 | } |
| 63 | 63 | ||
| @@ -82,13 +82,6 @@ void ceph_oloc_copy(struct ceph_object_locator *dest, | |||
| 82 | void ceph_oloc_destroy(struct ceph_object_locator *oloc); | 82 | void ceph_oloc_destroy(struct ceph_object_locator *oloc); |
| 83 | 83 | ||
| 84 | /* | 84 | /* |
| 85 | * Maximum supported by kernel client object name length | ||
| 86 | * | ||
| 87 | * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100) | ||
| 88 | */ | ||
| 89 | #define CEPH_MAX_OID_NAME_LEN 100 | ||
| 90 | |||
| 91 | /* | ||
| 92 | * 51-char inline_name is long enough for all cephfs and all but one | 85 | * 51-char inline_name is long enough for all cephfs and all but one |
| 93 | * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be | 86 | * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be |
| 94 | * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all | 87 | * arbitrarily long (~PAGE_SIZE). It's done once during rbd map; all |
| @@ -173,8 +166,8 @@ struct ceph_osdmap { | |||
| 173 | * the list of osds that store+replicate them. */ | 166 | * the list of osds that store+replicate them. */ |
| 174 | struct crush_map *crush; | 167 | struct crush_map *crush; |
| 175 | 168 | ||
| 176 | struct mutex crush_scratch_mutex; | 169 | struct mutex crush_workspace_mutex; |
| 177 | int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3]; | 170 | void *crush_workspace; |
| 178 | }; | 171 | }; |
| 179 | 172 | ||
| 180 | static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) | 173 | static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) |
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 5c0da61cb763..5d0018782d50 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
| @@ -50,7 +50,7 @@ struct ceph_timespec { | |||
| 50 | #define CEPH_PG_LAYOUT_LINEAR 2 | 50 | #define CEPH_PG_LAYOUT_LINEAR 2 |
| 51 | #define CEPH_PG_LAYOUT_HYBRID 3 | 51 | #define CEPH_PG_LAYOUT_HYBRID 3 |
| 52 | 52 | ||
| 53 | #define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */ | 53 | #define CEPH_PG_MAX_SIZE 32 /* max # osds in a single pg */ |
| 54 | 54 | ||
| 55 | /* | 55 | /* |
| 56 | * placement group. | 56 | * placement group. |
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index be8f12b8f195..fbecbd089d75 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h | |||
| @@ -135,13 +135,6 @@ struct crush_bucket { | |||
| 135 | __u32 size; /* num items */ | 135 | __u32 size; /* num items */ |
| 136 | __s32 *items; | 136 | __s32 *items; |
| 137 | 137 | ||
| 138 | /* | ||
| 139 | * cached random permutation: used for uniform bucket and for | ||
| 140 | * the linear search fallback for the other bucket types. | ||
| 141 | */ | ||
| 142 | __u32 perm_x; /* @x for which *perm is defined */ | ||
| 143 | __u32 perm_n; /* num elements of *perm that are permuted/defined */ | ||
| 144 | __u32 *perm; | ||
| 145 | }; | 138 | }; |
| 146 | 139 | ||
| 147 | struct crush_bucket_uniform { | 140 | struct crush_bucket_uniform { |
| @@ -211,6 +204,21 @@ struct crush_map { | |||
| 211 | * device fails. */ | 204 | * device fails. */ |
| 212 | __u8 chooseleaf_stable; | 205 | __u8 chooseleaf_stable; |
| 213 | 206 | ||
| 207 | /* | ||
| 208 | * This value is calculated after decode or construction by | ||
| 209 | * the builder. It is exposed here (rather than having a | ||
| 210 | * 'build CRUSH working space' function) so that callers can | ||
| 211 | * reserve a static buffer, allocate space on the stack, or | ||
| 212 | * otherwise avoid calling into the heap allocator if they | ||
| 213 | * want to. The size of the working space depends on the map, | ||
| 214 | * while the size of the scratch vector passed to the mapper | ||
| 215 | * depends on the size of the desired result set. | ||
| 216 | * | ||
| 217 | * Nothing stops the caller from allocating both in one swell | ||
| 218 | * foop and passing in two points, though. | ||
| 219 | */ | ||
| 220 | size_t working_size; | ||
| 221 | |||
| 214 | #ifndef __KERNEL__ | 222 | #ifndef __KERNEL__ |
| 215 | /* | 223 | /* |
| 216 | * version 0 (original) of straw_calc has various flaws. version 1 | 224 | * version 0 (original) of straw_calc has various flaws. version 1 |
| @@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i) | |||
| 248 | return ((i+1) << 1)-1; | 256 | return ((i+1) << 1)-1; |
| 249 | } | 257 | } |
| 250 | 258 | ||
| 259 | /* | ||
| 260 | * These data structures are private to the CRUSH implementation. They | ||
| 261 | * are exposed in this header file because builder needs their | ||
| 262 | * definitions to calculate the total working size. | ||
| 263 | * | ||
| 264 | * Moving this out of the crush map allow us to treat the CRUSH map as | ||
| 265 | * immutable within the mapper and removes the requirement for a CRUSH | ||
| 266 | * map lock. | ||
| 267 | */ | ||
| 268 | struct crush_work_bucket { | ||
| 269 | __u32 perm_x; /* @x for which *perm is defined */ | ||
| 270 | __u32 perm_n; /* num elements of *perm that are permuted/defined */ | ||
| 271 | __u32 *perm; /* Permutation of the bucket's items */ | ||
| 272 | }; | ||
| 273 | |||
| 274 | struct crush_work { | ||
| 275 | struct crush_work_bucket **work; /* Per-bucket working store */ | ||
| 276 | }; | ||
| 277 | |||
| 251 | #endif | 278 | #endif |
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h index 5dfd5b1125d2..c95e19e1ff11 100644 --- a/include/linux/crush/mapper.h +++ b/include/linux/crush/mapper.h | |||
| @@ -15,6 +15,20 @@ extern int crush_do_rule(const struct crush_map *map, | |||
| 15 | int ruleno, | 15 | int ruleno, |
| 16 | int x, int *result, int result_max, | 16 | int x, int *result, int result_max, |
| 17 | const __u32 *weights, int weight_max, | 17 | const __u32 *weights, int weight_max, |
| 18 | int *scratch); | 18 | void *cwin); |
| 19 | |||
| 20 | /* | ||
| 21 | * Returns the exact amount of workspace that will need to be used | ||
| 22 | * for a given combination of crush_map and result_max. The caller can | ||
| 23 | * then allocate this much on its own, either on the stack, in a | ||
| 24 | * per-thread long-lived buffer, or however it likes. | ||
| 25 | */ | ||
| 26 | static inline size_t crush_work_size(const struct crush_map *map, | ||
| 27 | int result_max) | ||
| 28 | { | ||
| 29 | return map->working_size + result_max * 3 * sizeof(__u32); | ||
| 30 | } | ||
| 31 | |||
| 32 | void crush_init_workspace(const struct crush_map *map, void *v); | ||
| 19 | 33 | ||
| 20 | #endif | 34 | #endif |
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 50f040fdb2a9..b9233b990399 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c | |||
| @@ -69,8 +69,8 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, | |||
| 69 | dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", | 69 | dout("%s lock_name %s type %d cookie %s tag %s desc %s flags 0x%x\n", |
| 70 | __func__, lock_name, type, cookie, tag, desc, flags); | 70 | __func__, lock_name, type, cookie, tag, desc, flags); |
| 71 | ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", | 71 | ret = ceph_osdc_call(osdc, oid, oloc, "lock", "lock", |
| 72 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 72 | CEPH_OSD_FLAG_WRITE, lock_op_page, |
| 73 | lock_op_page, lock_op_buf_size, NULL, NULL); | 73 | lock_op_buf_size, NULL, NULL); |
| 74 | 74 | ||
| 75 | dout("%s: status %d\n", __func__, ret); | 75 | dout("%s: status %d\n", __func__, ret); |
| 76 | __free_page(lock_op_page); | 76 | __free_page(lock_op_page); |
| @@ -117,8 +117,8 @@ int ceph_cls_unlock(struct ceph_osd_client *osdc, | |||
| 117 | 117 | ||
| 118 | dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); | 118 | dout("%s lock_name %s cookie %s\n", __func__, lock_name, cookie); |
| 119 | ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", | 119 | ret = ceph_osdc_call(osdc, oid, oloc, "lock", "unlock", |
| 120 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 120 | CEPH_OSD_FLAG_WRITE, unlock_op_page, |
| 121 | unlock_op_page, unlock_op_buf_size, NULL, NULL); | 121 | unlock_op_buf_size, NULL, NULL); |
| 122 | 122 | ||
| 123 | dout("%s: status %d\n", __func__, ret); | 123 | dout("%s: status %d\n", __func__, ret); |
| 124 | __free_page(unlock_op_page); | 124 | __free_page(unlock_op_page); |
| @@ -170,8 +170,8 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, | |||
| 170 | dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, | 170 | dout("%s lock_name %s cookie %s locker %s%llu\n", __func__, lock_name, |
| 171 | cookie, ENTITY_NAME(*locker)); | 171 | cookie, ENTITY_NAME(*locker)); |
| 172 | ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", | 172 | ret = ceph_osdc_call(osdc, oid, oloc, "lock", "break_lock", |
| 173 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 173 | CEPH_OSD_FLAG_WRITE, break_op_page, |
| 174 | break_op_page, break_op_buf_size, NULL, NULL); | 174 | break_op_buf_size, NULL, NULL); |
| 175 | 175 | ||
| 176 | dout("%s: status %d\n", __func__, ret); | 176 | dout("%s: status %d\n", __func__, ret); |
| 177 | __free_page(break_op_page); | 177 | __free_page(break_op_page); |
| @@ -278,7 +278,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc, | |||
| 278 | int get_info_op_buf_size; | 278 | int get_info_op_buf_size; |
| 279 | int name_len = strlen(lock_name); | 279 | int name_len = strlen(lock_name); |
| 280 | struct page *get_info_op_page, *reply_page; | 280 | struct page *get_info_op_page, *reply_page; |
| 281 | size_t reply_len; | 281 | size_t reply_len = PAGE_SIZE; |
| 282 | void *p, *end; | 282 | void *p, *end; |
| 283 | int ret; | 283 | int ret; |
| 284 | 284 | ||
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c index 80d7c3a97cb8..5bf94c04f645 100644 --- a/net/ceph/crush/crush.c +++ b/net/ceph/crush/crush.c | |||
| @@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) | |||
| 45 | 45 | ||
| 46 | void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) | 46 | void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) |
| 47 | { | 47 | { |
| 48 | kfree(b->h.perm); | ||
| 49 | kfree(b->h.items); | 48 | kfree(b->h.items); |
| 50 | kfree(b); | 49 | kfree(b); |
| 51 | } | 50 | } |
| @@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b) | |||
| 54 | { | 53 | { |
| 55 | kfree(b->item_weights); | 54 | kfree(b->item_weights); |
| 56 | kfree(b->sum_weights); | 55 | kfree(b->sum_weights); |
| 57 | kfree(b->h.perm); | ||
| 58 | kfree(b->h.items); | 56 | kfree(b->h.items); |
| 59 | kfree(b); | 57 | kfree(b); |
| 60 | } | 58 | } |
| 61 | 59 | ||
| 62 | void crush_destroy_bucket_tree(struct crush_bucket_tree *b) | 60 | void crush_destroy_bucket_tree(struct crush_bucket_tree *b) |
| 63 | { | 61 | { |
| 64 | kfree(b->h.perm); | ||
| 65 | kfree(b->h.items); | 62 | kfree(b->h.items); |
| 66 | kfree(b->node_weights); | 63 | kfree(b->node_weights); |
| 67 | kfree(b); | 64 | kfree(b); |
| @@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) | |||
| 71 | { | 68 | { |
| 72 | kfree(b->straws); | 69 | kfree(b->straws); |
| 73 | kfree(b->item_weights); | 70 | kfree(b->item_weights); |
| 74 | kfree(b->h.perm); | ||
| 75 | kfree(b->h.items); | 71 | kfree(b->h.items); |
| 76 | kfree(b); | 72 | kfree(b); |
| 77 | } | 73 | } |
| @@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b) | |||
| 79 | void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) | 75 | void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) |
| 80 | { | 76 | { |
| 81 | kfree(b->item_weights); | 77 | kfree(b->item_weights); |
| 82 | kfree(b->h.perm); | ||
| 83 | kfree(b->h.items); | 78 | kfree(b->h.items); |
| 84 | kfree(b); | 79 | kfree(b); |
| 85 | } | 80 | } |
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 130ab407c5ec..b5cd8c21bfdf 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c | |||
| @@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size | |||
| 54 | return -1; | 54 | return -1; |
| 55 | } | 55 | } |
| 56 | 56 | ||
| 57 | |||
| 58 | /* | 57 | /* |
| 59 | * bucket choose methods | 58 | * bucket choose methods |
| 60 | * | 59 | * |
| @@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size | |||
| 72 | * Since this is expensive, we optimize for the r=0 case, which | 71 | * Since this is expensive, we optimize for the r=0 case, which |
| 73 | * captures the vast majority of calls. | 72 | * captures the vast majority of calls. |
| 74 | */ | 73 | */ |
| 75 | static int bucket_perm_choose(struct crush_bucket *bucket, | 74 | static int bucket_perm_choose(const struct crush_bucket *bucket, |
| 75 | struct crush_work_bucket *work, | ||
| 76 | int x, int r) | 76 | int x, int r) |
| 77 | { | 77 | { |
| 78 | unsigned int pr = r % bucket->size; | 78 | unsigned int pr = r % bucket->size; |
| 79 | unsigned int i, s; | 79 | unsigned int i, s; |
| 80 | 80 | ||
| 81 | /* start a new permutation if @x has changed */ | 81 | /* start a new permutation if @x has changed */ |
| 82 | if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { | 82 | if (work->perm_x != (__u32)x || work->perm_n == 0) { |
| 83 | dprintk("bucket %d new x=%d\n", bucket->id, x); | 83 | dprintk("bucket %d new x=%d\n", bucket->id, x); |
| 84 | bucket->perm_x = x; | 84 | work->perm_x = x; |
| 85 | 85 | ||
| 86 | /* optimize common r=0 case */ | 86 | /* optimize common r=0 case */ |
| 87 | if (pr == 0) { | 87 | if (pr == 0) { |
| 88 | s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % | 88 | s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % |
| 89 | bucket->size; | 89 | bucket->size; |
| 90 | bucket->perm[0] = s; | 90 | work->perm[0] = s; |
| 91 | bucket->perm_n = 0xffff; /* magic value, see below */ | 91 | work->perm_n = 0xffff; /* magic value, see below */ |
| 92 | goto out; | 92 | goto out; |
| 93 | } | 93 | } |
| 94 | 94 | ||
| 95 | for (i = 0; i < bucket->size; i++) | 95 | for (i = 0; i < bucket->size; i++) |
| 96 | bucket->perm[i] = i; | 96 | work->perm[i] = i; |
| 97 | bucket->perm_n = 0; | 97 | work->perm_n = 0; |
| 98 | } else if (bucket->perm_n == 0xffff) { | 98 | } else if (work->perm_n == 0xffff) { |
| 99 | /* clean up after the r=0 case above */ | 99 | /* clean up after the r=0 case above */ |
| 100 | for (i = 1; i < bucket->size; i++) | 100 | for (i = 1; i < bucket->size; i++) |
| 101 | bucket->perm[i] = i; | 101 | work->perm[i] = i; |
| 102 | bucket->perm[bucket->perm[0]] = 0; | 102 | work->perm[work->perm[0]] = 0; |
| 103 | bucket->perm_n = 1; | 103 | work->perm_n = 1; |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | /* calculate permutation up to pr */ | 106 | /* calculate permutation up to pr */ |
| 107 | for (i = 0; i < bucket->perm_n; i++) | 107 | for (i = 0; i < work->perm_n; i++) |
| 108 | dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); | 108 | dprintk(" perm_choose have %d: %d\n", i, work->perm[i]); |
| 109 | while (bucket->perm_n <= pr) { | 109 | while (work->perm_n <= pr) { |
| 110 | unsigned int p = bucket->perm_n; | 110 | unsigned int p = work->perm_n; |
| 111 | /* no point in swapping the final entry */ | 111 | /* no point in swapping the final entry */ |
| 112 | if (p < bucket->size - 1) { | 112 | if (p < bucket->size - 1) { |
| 113 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % | 113 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % |
| 114 | (bucket->size - p); | 114 | (bucket->size - p); |
| 115 | if (i) { | 115 | if (i) { |
| 116 | unsigned int t = bucket->perm[p + i]; | 116 | unsigned int t = work->perm[p + i]; |
| 117 | bucket->perm[p + i] = bucket->perm[p]; | 117 | work->perm[p + i] = work->perm[p]; |
| 118 | bucket->perm[p] = t; | 118 | work->perm[p] = t; |
| 119 | } | 119 | } |
| 120 | dprintk(" perm_choose swap %d with %d\n", p, p+i); | 120 | dprintk(" perm_choose swap %d with %d\n", p, p+i); |
| 121 | } | 121 | } |
| 122 | bucket->perm_n++; | 122 | work->perm_n++; |
| 123 | } | 123 | } |
| 124 | for (i = 0; i < bucket->size; i++) | 124 | for (i = 0; i < bucket->size; i++) |
| 125 | dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); | 125 | dprintk(" perm_choose %d: %d\n", i, work->perm[i]); |
| 126 | 126 | ||
| 127 | s = bucket->perm[pr]; | 127 | s = work->perm[pr]; |
| 128 | out: | 128 | out: |
| 129 | dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, | 129 | dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, |
| 130 | bucket->size, x, r, pr, s); | 130 | bucket->size, x, r, pr, s); |
| @@ -132,14 +132,14 @@ out: | |||
| 132 | } | 132 | } |
| 133 | 133 | ||
| 134 | /* uniform */ | 134 | /* uniform */ |
| 135 | static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, | 135 | static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket, |
| 136 | int x, int r) | 136 | struct crush_work_bucket *work, int x, int r) |
| 137 | { | 137 | { |
| 138 | return bucket_perm_choose(&bucket->h, x, r); | 138 | return bucket_perm_choose(&bucket->h, work, x, r); |
| 139 | } | 139 | } |
| 140 | 140 | ||
| 141 | /* list */ | 141 | /* list */ |
| 142 | static int bucket_list_choose(struct crush_bucket_list *bucket, | 142 | static int bucket_list_choose(const struct crush_bucket_list *bucket, |
| 143 | int x, int r) | 143 | int x, int r) |
| 144 | { | 144 | { |
| 145 | int i; | 145 | int i; |
| @@ -155,8 +155,9 @@ static int bucket_list_choose(struct crush_bucket_list *bucket, | |||
| 155 | w *= bucket->sum_weights[i]; | 155 | w *= bucket->sum_weights[i]; |
| 156 | w = w >> 16; | 156 | w = w >> 16; |
| 157 | /*dprintk(" scaled %llx\n", w);*/ | 157 | /*dprintk(" scaled %llx\n", w);*/ |
| 158 | if (w < bucket->item_weights[i]) | 158 | if (w < bucket->item_weights[i]) { |
| 159 | return bucket->h.items[i]; | 159 | return bucket->h.items[i]; |
| 160 | } | ||
| 160 | } | 161 | } |
| 161 | 162 | ||
| 162 | dprintk("bad list sums for bucket %d\n", bucket->h.id); | 163 | dprintk("bad list sums for bucket %d\n", bucket->h.id); |
| @@ -192,7 +193,7 @@ static int terminal(int x) | |||
| 192 | return x & 1; | 193 | return x & 1; |
| 193 | } | 194 | } |
| 194 | 195 | ||
| 195 | static int bucket_tree_choose(struct crush_bucket_tree *bucket, | 196 | static int bucket_tree_choose(const struct crush_bucket_tree *bucket, |
| 196 | int x, int r) | 197 | int x, int r) |
| 197 | { | 198 | { |
| 198 | int n; | 199 | int n; |
| @@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket, | |||
| 224 | 225 | ||
| 225 | /* straw */ | 226 | /* straw */ |
| 226 | 227 | ||
| 227 | static int bucket_straw_choose(struct crush_bucket_straw *bucket, | 228 | static int bucket_straw_choose(const struct crush_bucket_straw *bucket, |
| 228 | int x, int r) | 229 | int x, int r) |
| 229 | { | 230 | { |
| 230 | __u32 i; | 231 | __u32 i; |
| @@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin) | |||
| 301 | * | 302 | * |
| 302 | */ | 303 | */ |
| 303 | 304 | ||
| 304 | static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, | 305 | static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, |
| 305 | int x, int r) | 306 | int x, int r) |
| 306 | { | 307 | { |
| 307 | unsigned int i, high = 0; | 308 | unsigned int i, high = 0; |
| @@ -344,37 +345,42 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket, | |||
| 344 | high_draw = draw; | 345 | high_draw = draw; |
| 345 | } | 346 | } |
| 346 | } | 347 | } |
| 348 | |||
| 347 | return bucket->h.items[high]; | 349 | return bucket->h.items[high]; |
| 348 | } | 350 | } |
| 349 | 351 | ||
| 350 | 352 | ||
| 351 | static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | 353 | static int crush_bucket_choose(const struct crush_bucket *in, |
| 354 | struct crush_work_bucket *work, | ||
| 355 | int x, int r) | ||
| 352 | { | 356 | { |
| 353 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); | 357 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); |
| 354 | BUG_ON(in->size == 0); | 358 | BUG_ON(in->size == 0); |
| 355 | switch (in->alg) { | 359 | switch (in->alg) { |
| 356 | case CRUSH_BUCKET_UNIFORM: | 360 | case CRUSH_BUCKET_UNIFORM: |
| 357 | return bucket_uniform_choose((struct crush_bucket_uniform *)in, | 361 | return bucket_uniform_choose( |
| 358 | x, r); | 362 | (const struct crush_bucket_uniform *)in, |
| 363 | work, x, r); | ||
| 359 | case CRUSH_BUCKET_LIST: | 364 | case CRUSH_BUCKET_LIST: |
| 360 | return bucket_list_choose((struct crush_bucket_list *)in, | 365 | return bucket_list_choose((const struct crush_bucket_list *)in, |
| 361 | x, r); | 366 | x, r); |
| 362 | case CRUSH_BUCKET_TREE: | 367 | case CRUSH_BUCKET_TREE: |
| 363 | return bucket_tree_choose((struct crush_bucket_tree *)in, | 368 | return bucket_tree_choose((const struct crush_bucket_tree *)in, |
| 364 | x, r); | 369 | x, r); |
| 365 | case CRUSH_BUCKET_STRAW: | 370 | case CRUSH_BUCKET_STRAW: |
| 366 | return bucket_straw_choose((struct crush_bucket_straw *)in, | 371 | return bucket_straw_choose( |
| 367 | x, r); | 372 | (const struct crush_bucket_straw *)in, |
| 373 | x, r); | ||
| 368 | case CRUSH_BUCKET_STRAW2: | 374 | case CRUSH_BUCKET_STRAW2: |
| 369 | return bucket_straw2_choose((struct crush_bucket_straw2 *)in, | 375 | return bucket_straw2_choose( |
| 370 | x, r); | 376 | (const struct crush_bucket_straw2 *)in, |
| 377 | x, r); | ||
| 371 | default: | 378 | default: |
| 372 | dprintk("unknown bucket %d alg %d\n", in->id, in->alg); | 379 | dprintk("unknown bucket %d alg %d\n", in->id, in->alg); |
| 373 | return in->items[0]; | 380 | return in->items[0]; |
| 374 | } | 381 | } |
| 375 | } | 382 | } |
| 376 | 383 | ||
| 377 | |||
| 378 | /* | 384 | /* |
| 379 | * true if device is marked "out" (failed, fully offloaded) | 385 | * true if device is marked "out" (failed, fully offloaded) |
| 380 | * of the cluster | 386 | * of the cluster |
| @@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map, | |||
| 416 | * @parent_r: r value passed from the parent | 422 | * @parent_r: r value passed from the parent |
| 417 | */ | 423 | */ |
| 418 | static int crush_choose_firstn(const struct crush_map *map, | 424 | static int crush_choose_firstn(const struct crush_map *map, |
| 419 | struct crush_bucket *bucket, | 425 | struct crush_work *work, |
| 426 | const struct crush_bucket *bucket, | ||
| 420 | const __u32 *weight, int weight_max, | 427 | const __u32 *weight, int weight_max, |
| 421 | int x, int numrep, int type, | 428 | int x, int numrep, int type, |
| 422 | int *out, int outpos, | 429 | int *out, int outpos, |
| @@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
| 434 | int rep; | 441 | int rep; |
| 435 | unsigned int ftotal, flocal; | 442 | unsigned int ftotal, flocal; |
| 436 | int retry_descent, retry_bucket, skip_rep; | 443 | int retry_descent, retry_bucket, skip_rep; |
| 437 | struct crush_bucket *in = bucket; | 444 | const struct crush_bucket *in = bucket; |
| 438 | int r; | 445 | int r; |
| 439 | int i; | 446 | int i; |
| 440 | int item = 0; | 447 | int item = 0; |
| @@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
| 473 | if (local_fallback_retries > 0 && | 480 | if (local_fallback_retries > 0 && |
| 474 | flocal >= (in->size>>1) && | 481 | flocal >= (in->size>>1) && |
| 475 | flocal > local_fallback_retries) | 482 | flocal > local_fallback_retries) |
| 476 | item = bucket_perm_choose(in, x, r); | 483 | item = bucket_perm_choose( |
| 484 | in, work->work[-1-in->id], | ||
| 485 | x, r); | ||
| 477 | else | 486 | else |
| 478 | item = crush_bucket_choose(in, x, r); | 487 | item = crush_bucket_choose( |
| 488 | in, work->work[-1-in->id], | ||
| 489 | x, r); | ||
| 479 | if (item >= map->max_devices) { | 490 | if (item >= map->max_devices) { |
| 480 | dprintk(" bad item %d\n", item); | 491 | dprintk(" bad item %d\n", item); |
| 481 | skip_rep = 1; | 492 | skip_rep = 1; |
| @@ -518,19 +529,21 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
| 518 | sub_r = r >> (vary_r-1); | 529 | sub_r = r >> (vary_r-1); |
| 519 | else | 530 | else |
| 520 | sub_r = 0; | 531 | sub_r = 0; |
| 521 | if (crush_choose_firstn(map, | 532 | if (crush_choose_firstn( |
| 522 | map->buckets[-1-item], | 533 | map, |
| 523 | weight, weight_max, | 534 | work, |
| 524 | x, stable ? 1 : outpos+1, 0, | 535 | map->buckets[-1-item], |
| 525 | out2, outpos, count, | 536 | weight, weight_max, |
| 526 | recurse_tries, 0, | 537 | x, stable ? 1 : outpos+1, 0, |
| 527 | local_retries, | 538 | out2, outpos, count, |
| 528 | local_fallback_retries, | 539 | recurse_tries, 0, |
| 529 | 0, | 540 | local_retries, |
| 530 | vary_r, | 541 | local_fallback_retries, |
| 531 | stable, | 542 | 0, |
| 532 | NULL, | 543 | vary_r, |
| 533 | sub_r) <= outpos) | 544 | stable, |
| 545 | NULL, | ||
| 546 | sub_r) <= outpos) | ||
| 534 | /* didn't get leaf */ | 547 | /* didn't get leaf */ |
| 535 | reject = 1; | 548 | reject = 1; |
| 536 | } else { | 549 | } else { |
| @@ -539,14 +552,12 @@ static int crush_choose_firstn(const struct crush_map *map, | |||
| 539 | } | 552 | } |
| 540 | } | 553 | } |
| 541 | 554 | ||
| 542 | if (!reject) { | 555 | if (!reject && !collide) { |
| 543 | /* out? */ | 556 | /* out? */ |
| 544 | if (itemtype == 0) | 557 | if (itemtype == 0) |
| 545 | reject = is_out(map, weight, | 558 | reject = is_out(map, weight, |
| 546 | weight_max, | 559 | weight_max, |
| 547 | item, x); | 560 | item, x); |
| 548 | else | ||
| 549 | reject = 0; | ||
| 550 | } | 561 | } |
| 551 | 562 | ||
| 552 | reject: | 563 | reject: |
| @@ -600,7 +611,8 @@ reject: | |||
| 600 | * | 611 | * |
| 601 | */ | 612 | */ |
| 602 | static void crush_choose_indep(const struct crush_map *map, | 613 | static void crush_choose_indep(const struct crush_map *map, |
| 603 | struct crush_bucket *bucket, | 614 | struct crush_work *work, |
| 615 | const struct crush_bucket *bucket, | ||
| 604 | const __u32 *weight, int weight_max, | 616 | const __u32 *weight, int weight_max, |
| 605 | int x, int left, int numrep, int type, | 617 | int x, int left, int numrep, int type, |
| 606 | int *out, int outpos, | 618 | int *out, int outpos, |
| @@ -610,7 +622,7 @@ static void crush_choose_indep(const struct crush_map *map, | |||
| 610 | int *out2, | 622 | int *out2, |
| 611 | int parent_r) | 623 | int parent_r) |
| 612 | { | 624 | { |
| 613 | struct crush_bucket *in = bucket; | 625 | const struct crush_bucket *in = bucket; |
| 614 | int endpos = outpos + left; | 626 | int endpos = outpos + left; |
| 615 | int rep; | 627 | int rep; |
| 616 | unsigned int ftotal; | 628 | unsigned int ftotal; |
| @@ -678,7 +690,9 @@ static void crush_choose_indep(const struct crush_map *map, | |||
| 678 | break; | 690 | break; |
| 679 | } | 691 | } |
| 680 | 692 | ||
| 681 | item = crush_bucket_choose(in, x, r); | 693 | item = crush_bucket_choose( |
| 694 | in, work->work[-1-in->id], | ||
| 695 | x, r); | ||
| 682 | if (item >= map->max_devices) { | 696 | if (item >= map->max_devices) { |
| 683 | dprintk(" bad item %d\n", item); | 697 | dprintk(" bad item %d\n", item); |
| 684 | out[rep] = CRUSH_ITEM_NONE; | 698 | out[rep] = CRUSH_ITEM_NONE; |
| @@ -724,13 +738,15 @@ static void crush_choose_indep(const struct crush_map *map, | |||
| 724 | 738 | ||
| 725 | if (recurse_to_leaf) { | 739 | if (recurse_to_leaf) { |
| 726 | if (item < 0) { | 740 | if (item < 0) { |
| 727 | crush_choose_indep(map, | 741 | crush_choose_indep( |
| 728 | map->buckets[-1-item], | 742 | map, |
| 729 | weight, weight_max, | 743 | work, |
| 730 | x, 1, numrep, 0, | 744 | map->buckets[-1-item], |
| 731 | out2, rep, | 745 | weight, weight_max, |
| 732 | recurse_tries, 0, | 746 | x, 1, numrep, 0, |
| 733 | 0, NULL, r); | 747 | out2, rep, |
| 748 | recurse_tries, 0, | ||
| 749 | 0, NULL, r); | ||
| 734 | if (out2[rep] == CRUSH_ITEM_NONE) { | 750 | if (out2[rep] == CRUSH_ITEM_NONE) { |
| 735 | /* placed nothing; no leaf */ | 751 | /* placed nothing; no leaf */ |
| 736 | break; | 752 | break; |
| @@ -781,6 +797,53 @@ static void crush_choose_indep(const struct crush_map *map, | |||
| 781 | #endif | 797 | #endif |
| 782 | } | 798 | } |
| 783 | 799 | ||
| 800 | |||
| 801 | /* | ||
| 802 | * This takes a chunk of memory and sets it up to be a shiny new | ||
| 803 | * working area for a CRUSH placement computation. It must be called | ||
| 804 | * on any newly allocated memory before passing it in to | ||
| 805 | * crush_do_rule. It may be used repeatedly after that, so long as the | ||
| 806 | * map has not changed. If the map /has/ changed, you must make sure | ||
| 807 | * the working size is no smaller than what was allocated and re-run | ||
| 808 | * crush_init_workspace. | ||
| 809 | * | ||
| 810 | * If you do retain the working space between calls to crush, make it | ||
| 811 | * thread-local. | ||
| 812 | */ | ||
| 813 | void crush_init_workspace(const struct crush_map *map, void *v) | ||
| 814 | { | ||
| 815 | struct crush_work *w = v; | ||
| 816 | __s32 b; | ||
| 817 | |||
| 818 | /* | ||
| 819 | * We work by moving through the available space and setting | ||
| 820 | * values and pointers as we go. | ||
| 821 | * | ||
| 822 | * It's a bit like Forth's use of the 'allot' word since we | ||
| 823 | * set the pointer first and then reserve the space for it to | ||
| 824 | * point to by incrementing the point. | ||
| 825 | */ | ||
| 826 | v += sizeof(struct crush_work *); | ||
| 827 | w->work = v; | ||
| 828 | v += map->max_buckets * sizeof(struct crush_work_bucket *); | ||
| 829 | for (b = 0; b < map->max_buckets; ++b) { | ||
| 830 | if (!map->buckets[b]) | ||
| 831 | continue; | ||
| 832 | |||
| 833 | w->work[b] = v; | ||
| 834 | switch (map->buckets[b]->alg) { | ||
| 835 | default: | ||
| 836 | v += sizeof(struct crush_work_bucket); | ||
| 837 | break; | ||
| 838 | } | ||
| 839 | w->work[b]->perm_x = 0; | ||
| 840 | w->work[b]->perm_n = 0; | ||
| 841 | w->work[b]->perm = v; | ||
| 842 | v += map->buckets[b]->size * sizeof(__u32); | ||
| 843 | } | ||
| 844 | BUG_ON(v - (void *)w != map->working_size); | ||
| 845 | } | ||
| 846 | |||
| 784 | /** | 847 | /** |
| 785 | * crush_do_rule - calculate a mapping with the given input and rule | 848 | * crush_do_rule - calculate a mapping with the given input and rule |
| 786 | * @map: the crush_map | 849 | * @map: the crush_map |
| @@ -790,24 +853,25 @@ static void crush_choose_indep(const struct crush_map *map, | |||
| 790 | * @result_max: maximum result size | 853 | * @result_max: maximum result size |
| 791 | * @weight: weight vector (for map leaves) | 854 | * @weight: weight vector (for map leaves) |
| 792 | * @weight_max: size of weight vector | 855 | * @weight_max: size of weight vector |
| 793 | * @scratch: scratch vector for private use; must be >= 3 * result_max | 856 | * @cwin: pointer to at least crush_work_size() bytes of memory |
| 794 | */ | 857 | */ |
| 795 | int crush_do_rule(const struct crush_map *map, | 858 | int crush_do_rule(const struct crush_map *map, |
| 796 | int ruleno, int x, int *result, int result_max, | 859 | int ruleno, int x, int *result, int result_max, |
| 797 | const __u32 *weight, int weight_max, | 860 | const __u32 *weight, int weight_max, |
| 798 | int *scratch) | 861 | void *cwin) |
| 799 | { | 862 | { |
| 800 | int result_len; | 863 | int result_len; |
| 801 | int *a = scratch; | 864 | struct crush_work *cw = cwin; |
| 802 | int *b = scratch + result_max; | 865 | int *a = cwin + map->working_size; |
| 803 | int *c = scratch + result_max*2; | 866 | int *b = a + result_max; |
| 867 | int *c = b + result_max; | ||
| 868 | int *w = a; | ||
| 869 | int *o = b; | ||
| 804 | int recurse_to_leaf; | 870 | int recurse_to_leaf; |
| 805 | int *w; | ||
| 806 | int wsize = 0; | 871 | int wsize = 0; |
| 807 | int *o; | ||
| 808 | int osize; | 872 | int osize; |
| 809 | int *tmp; | 873 | int *tmp; |
| 810 | struct crush_rule *rule; | 874 | const struct crush_rule *rule; |
| 811 | __u32 step; | 875 | __u32 step; |
| 812 | int i, j; | 876 | int i, j; |
| 813 | int numrep; | 877 | int numrep; |
| @@ -835,12 +899,10 @@ int crush_do_rule(const struct crush_map *map, | |||
| 835 | 899 | ||
| 836 | rule = map->rules[ruleno]; | 900 | rule = map->rules[ruleno]; |
| 837 | result_len = 0; | 901 | result_len = 0; |
| 838 | w = a; | ||
| 839 | o = b; | ||
| 840 | 902 | ||
| 841 | for (step = 0; step < rule->len; step++) { | 903 | for (step = 0; step < rule->len; step++) { |
| 842 | int firstn = 0; | 904 | int firstn = 0; |
| 843 | struct crush_rule_step *curstep = &rule->steps[step]; | 905 | const struct crush_rule_step *curstep = &rule->steps[step]; |
| 844 | 906 | ||
| 845 | switch (curstep->op) { | 907 | switch (curstep->op) { |
| 846 | case CRUSH_RULE_TAKE: | 908 | case CRUSH_RULE_TAKE: |
| @@ -936,6 +998,7 @@ int crush_do_rule(const struct crush_map *map, | |||
| 936 | recurse_tries = choose_tries; | 998 | recurse_tries = choose_tries; |
| 937 | osize += crush_choose_firstn( | 999 | osize += crush_choose_firstn( |
| 938 | map, | 1000 | map, |
| 1001 | cw, | ||
| 939 | map->buckets[bno], | 1002 | map->buckets[bno], |
| 940 | weight, weight_max, | 1003 | weight, weight_max, |
| 941 | x, numrep, | 1004 | x, numrep, |
| @@ -956,6 +1019,7 @@ int crush_do_rule(const struct crush_map *map, | |||
| 956 | numrep : (result_max-osize)); | 1019 | numrep : (result_max-osize)); |
| 957 | crush_choose_indep( | 1020 | crush_choose_indep( |
| 958 | map, | 1021 | map, |
| 1022 | cw, | ||
| 959 | map->buckets[bno], | 1023 | map->buckets[bno], |
| 960 | weight, weight_max, | 1024 | weight, weight_max, |
| 961 | x, out_size, numrep, | 1025 | x, out_size, numrep, |
| @@ -997,5 +1061,6 @@ int crush_do_rule(const struct crush_map *map, | |||
| 997 | break; | 1061 | break; |
| 998 | } | 1062 | } |
| 999 | } | 1063 | } |
| 1064 | |||
| 1000 | return result_len; | 1065 | return result_len; |
| 1001 | } | 1066 | } |
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 292e33bd916e..85747b7f91a9 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c | |||
| @@ -3,6 +3,7 @@ | |||
| 3 | 3 | ||
| 4 | #include <linux/err.h> | 4 | #include <linux/err.h> |
| 5 | #include <linux/scatterlist.h> | 5 | #include <linux/scatterlist.h> |
| 6 | #include <linux/sched.h> | ||
| 6 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
| 7 | #include <crypto/aes.h> | 8 | #include <crypto/aes.h> |
| 8 | #include <crypto/skcipher.h> | 9 | #include <crypto/skcipher.h> |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f3378ba1a828..b65bbf9f45eb 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
| @@ -460,7 +460,6 @@ static void request_init(struct ceph_osd_request *req) | |||
| 460 | 460 | ||
| 461 | kref_init(&req->r_kref); | 461 | kref_init(&req->r_kref); |
| 462 | init_completion(&req->r_completion); | 462 | init_completion(&req->r_completion); |
| 463 | init_completion(&req->r_done_completion); | ||
| 464 | RB_CLEAR_NODE(&req->r_node); | 463 | RB_CLEAR_NODE(&req->r_node); |
| 465 | RB_CLEAR_NODE(&req->r_mc_node); | 464 | RB_CLEAR_NODE(&req->r_mc_node); |
| 466 | INIT_LIST_HEAD(&req->r_unsafe_item); | 465 | INIT_LIST_HEAD(&req->r_unsafe_item); |
| @@ -672,7 +671,8 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, | |||
| 672 | BUG_ON(length > previous); | 671 | BUG_ON(length > previous); |
| 673 | 672 | ||
| 674 | op->extent.length = length; | 673 | op->extent.length = length; |
| 675 | op->indata_len -= previous - length; | 674 | if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) |
| 675 | op->indata_len -= previous - length; | ||
| 676 | } | 676 | } |
| 677 | EXPORT_SYMBOL(osd_req_op_extent_update); | 677 | EXPORT_SYMBOL(osd_req_op_extent_update); |
| 678 | 678 | ||
| @@ -1636,7 +1636,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) | |||
| 1636 | bool need_send = false; | 1636 | bool need_send = false; |
| 1637 | bool promoted = false; | 1637 | bool promoted = false; |
| 1638 | 1638 | ||
| 1639 | WARN_ON(req->r_tid || req->r_got_reply); | 1639 | WARN_ON(req->r_tid); |
| 1640 | dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); | 1640 | dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); |
| 1641 | 1641 | ||
| 1642 | again: | 1642 | again: |
| @@ -1704,17 +1704,10 @@ promote: | |||
| 1704 | 1704 | ||
| 1705 | static void account_request(struct ceph_osd_request *req) | 1705 | static void account_request(struct ceph_osd_request *req) |
| 1706 | { | 1706 | { |
| 1707 | unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; | 1707 | WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); |
| 1708 | WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); | ||
| 1708 | 1709 | ||
| 1709 | if (req->r_flags & CEPH_OSD_FLAG_READ) { | 1710 | req->r_flags |= CEPH_OSD_FLAG_ONDISK; |
| 1710 | WARN_ON(req->r_flags & mask); | ||
| 1711 | req->r_flags |= CEPH_OSD_FLAG_ACK; | ||
| 1712 | } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) | ||
| 1713 | WARN_ON(!(req->r_flags & mask)); | ||
| 1714 | else | ||
| 1715 | WARN_ON(1); | ||
| 1716 | |||
| 1717 | WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); | ||
| 1718 | atomic_inc(&req->r_osdc->num_requests); | 1711 | atomic_inc(&req->r_osdc->num_requests); |
| 1719 | } | 1712 | } |
| 1720 | 1713 | ||
| @@ -1749,15 +1742,15 @@ static void finish_request(struct ceph_osd_request *req) | |||
| 1749 | 1742 | ||
| 1750 | static void __complete_request(struct ceph_osd_request *req) | 1743 | static void __complete_request(struct ceph_osd_request *req) |
| 1751 | { | 1744 | { |
| 1752 | if (req->r_callback) | 1745 | if (req->r_callback) { |
| 1746 | dout("%s req %p tid %llu cb %pf result %d\n", __func__, req, | ||
| 1747 | req->r_tid, req->r_callback, req->r_result); | ||
| 1753 | req->r_callback(req); | 1748 | req->r_callback(req); |
| 1754 | else | 1749 | } |
| 1755 | complete_all(&req->r_completion); | ||
| 1756 | } | 1750 | } |
| 1757 | 1751 | ||
| 1758 | /* | 1752 | /* |
| 1759 | * Note that this is open-coded in handle_reply(), which has to deal | 1753 | * This is open-coded in handle_reply(). |
| 1760 | * with ack vs commit, dup acks, etc. | ||
| 1761 | */ | 1754 | */ |
| 1762 | static void complete_request(struct ceph_osd_request *req, int err) | 1755 | static void complete_request(struct ceph_osd_request *req, int err) |
| 1763 | { | 1756 | { |
| @@ -1766,7 +1759,7 @@ static void complete_request(struct ceph_osd_request *req, int err) | |||
| 1766 | req->r_result = err; | 1759 | req->r_result = err; |
| 1767 | finish_request(req); | 1760 | finish_request(req); |
| 1768 | __complete_request(req); | 1761 | __complete_request(req); |
| 1769 | complete_all(&req->r_done_completion); | 1762 | complete_all(&req->r_completion); |
| 1770 | ceph_osdc_put_request(req); | 1763 | ceph_osdc_put_request(req); |
| 1771 | } | 1764 | } |
| 1772 | 1765 | ||
| @@ -1792,7 +1785,7 @@ static void cancel_request(struct ceph_osd_request *req) | |||
| 1792 | 1785 | ||
| 1793 | cancel_map_check(req); | 1786 | cancel_map_check(req); |
| 1794 | finish_request(req); | 1787 | finish_request(req); |
| 1795 | complete_all(&req->r_done_completion); | 1788 | complete_all(&req->r_completion); |
| 1796 | ceph_osdc_put_request(req); | 1789 | ceph_osdc_put_request(req); |
| 1797 | } | 1790 | } |
| 1798 | 1791 | ||
| @@ -2169,7 +2162,6 @@ static void linger_commit_cb(struct ceph_osd_request *req) | |||
| 2169 | mutex_lock(&lreq->lock); | 2162 | mutex_lock(&lreq->lock); |
| 2170 | dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, | 2163 | dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, |
| 2171 | lreq->linger_id, req->r_result); | 2164 | lreq->linger_id, req->r_result); |
| 2172 | WARN_ON(!__linger_registered(lreq)); | ||
| 2173 | linger_reg_commit_complete(lreq, req->r_result); | 2165 | linger_reg_commit_complete(lreq, req->r_result); |
| 2174 | lreq->committed = true; | 2166 | lreq->committed = true; |
| 2175 | 2167 | ||
| @@ -2785,31 +2777,8 @@ e_inval: | |||
| 2785 | } | 2777 | } |
| 2786 | 2778 | ||
| 2787 | /* | 2779 | /* |
| 2788 | * We are done with @req if | 2780 | * Handle MOSDOpReply. Set ->r_result and call the callback if it is |
| 2789 | * - @m is a safe reply, or | 2781 | * specified. |
| 2790 | * - @m is an unsafe reply and we didn't want a safe one | ||
| 2791 | */ | ||
| 2792 | static bool done_request(const struct ceph_osd_request *req, | ||
| 2793 | const struct MOSDOpReply *m) | ||
| 2794 | { | ||
| 2795 | return (m->result < 0 || | ||
| 2796 | (m->flags & CEPH_OSD_FLAG_ONDISK) || | ||
| 2797 | !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); | ||
| 2798 | } | ||
| 2799 | |||
| 2800 | /* | ||
| 2801 | * handle osd op reply. either call the callback if it is specified, | ||
| 2802 | * or do the completion to wake up the waiting thread. | ||
| 2803 | * | ||
| 2804 | * ->r_unsafe_callback is set? yes no | ||
| 2805 | * | ||
| 2806 | * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, | ||
| 2807 | * any or needed/got safe) r_done_completion r_done_completion | ||
| 2808 | * | ||
| 2809 | * first reply is unsafe r_unsafe_cb(true) (nothing) | ||
| 2810 | * | ||
| 2811 | * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, | ||
| 2812 | * r_done_completion r_done_completion | ||
| 2813 | */ | 2782 | */ |
| 2814 | static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) | 2783 | static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) |
| 2815 | { | 2784 | { |
| @@ -2818,7 +2787,6 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) | |||
| 2818 | struct MOSDOpReply m; | 2787 | struct MOSDOpReply m; |
| 2819 | u64 tid = le64_to_cpu(msg->hdr.tid); | 2788 | u64 tid = le64_to_cpu(msg->hdr.tid); |
| 2820 | u32 data_len = 0; | 2789 | u32 data_len = 0; |
| 2821 | bool already_acked; | ||
| 2822 | int ret; | 2790 | int ret; |
| 2823 | int i; | 2791 | int i; |
| 2824 | 2792 | ||
| @@ -2897,50 +2865,22 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) | |||
| 2897 | le32_to_cpu(msg->hdr.data_len), req->r_tid); | 2865 | le32_to_cpu(msg->hdr.data_len), req->r_tid); |
| 2898 | goto fail_request; | 2866 | goto fail_request; |
| 2899 | } | 2867 | } |
| 2900 | dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, | 2868 | dout("%s req %p tid %llu result %d data_len %u\n", __func__, |
| 2901 | req, req->r_tid, req->r_got_reply, m.result, data_len); | 2869 | req, req->r_tid, m.result, data_len); |
| 2902 | |||
| 2903 | already_acked = req->r_got_reply; | ||
| 2904 | if (!already_acked) { | ||
| 2905 | req->r_result = m.result ?: data_len; | ||
| 2906 | req->r_replay_version = m.replay_version; /* struct */ | ||
| 2907 | req->r_got_reply = true; | ||
| 2908 | } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { | ||
| 2909 | dout("req %p tid %llu dup ack\n", req, req->r_tid); | ||
| 2910 | goto out_unlock_session; | ||
| 2911 | } | ||
| 2912 | |||
| 2913 | if (done_request(req, &m)) { | ||
| 2914 | finish_request(req); | ||
| 2915 | if (req->r_linger) { | ||
| 2916 | WARN_ON(req->r_unsafe_callback); | ||
| 2917 | dout("req %p tid %llu cb (locked)\n", req, req->r_tid); | ||
| 2918 | __complete_request(req); | ||
| 2919 | } | ||
| 2920 | } | ||
| 2921 | 2870 | ||
| 2871 | /* | ||
| 2872 | * Since we only ever request ONDISK, we should only ever get | ||
| 2873 | * one (type of) reply back. | ||
| 2874 | */ | ||
| 2875 | WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); | ||
| 2876 | req->r_result = m.result ?: data_len; | ||
| 2877 | finish_request(req); | ||
| 2922 | mutex_unlock(&osd->lock); | 2878 | mutex_unlock(&osd->lock); |
| 2923 | up_read(&osdc->lock); | 2879 | up_read(&osdc->lock); |
| 2924 | 2880 | ||
| 2925 | if (done_request(req, &m)) { | 2881 | __complete_request(req); |
| 2926 | if (already_acked && req->r_unsafe_callback) { | 2882 | complete_all(&req->r_completion); |
| 2927 | dout("req %p tid %llu safe-cb\n", req, req->r_tid); | 2883 | ceph_osdc_put_request(req); |
| 2928 | req->r_unsafe_callback(req, false); | ||
| 2929 | } else if (!req->r_linger) { | ||
| 2930 | dout("req %p tid %llu cb\n", req, req->r_tid); | ||
| 2931 | __complete_request(req); | ||
| 2932 | } | ||
| 2933 | complete_all(&req->r_done_completion); | ||
| 2934 | ceph_osdc_put_request(req); | ||
| 2935 | } else { | ||
| 2936 | if (req->r_unsafe_callback) { | ||
| 2937 | dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); | ||
| 2938 | req->r_unsafe_callback(req, true); | ||
| 2939 | } else { | ||
| 2940 | WARN_ON(1); | ||
| 2941 | } | ||
| 2942 | } | ||
| 2943 | |||
| 2944 | return; | 2884 | return; |
| 2945 | 2885 | ||
| 2946 | fail_request: | 2886 | fail_request: |
| @@ -3540,7 +3480,7 @@ again: | |||
| 3540 | up_read(&osdc->lock); | 3480 | up_read(&osdc->lock); |
| 3541 | dout("%s waiting on req %p tid %llu last_tid %llu\n", | 3481 | dout("%s waiting on req %p tid %llu last_tid %llu\n", |
| 3542 | __func__, req, req->r_tid, last_tid); | 3482 | __func__, req, req->r_tid, last_tid); |
| 3543 | wait_for_completion(&req->r_done_completion); | 3483 | wait_for_completion(&req->r_completion); |
| 3544 | ceph_osdc_put_request(req); | 3484 | ceph_osdc_put_request(req); |
| 3545 | goto again; | 3485 | goto again; |
| 3546 | } | 3486 | } |
| @@ -3599,7 +3539,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, | |||
| 3599 | 3539 | ||
| 3600 | ceph_oid_copy(&lreq->t.base_oid, oid); | 3540 | ceph_oid_copy(&lreq->t.base_oid, oid); |
| 3601 | ceph_oloc_copy(&lreq->t.base_oloc, oloc); | 3541 | ceph_oloc_copy(&lreq->t.base_oloc, oloc); |
| 3602 | lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | 3542 | lreq->t.flags = CEPH_OSD_FLAG_WRITE; |
| 3603 | lreq->mtime = CURRENT_TIME; | 3543 | lreq->mtime = CURRENT_TIME; |
| 3604 | 3544 | ||
| 3605 | lreq->reg_req = alloc_linger_request(lreq); | 3545 | lreq->reg_req = alloc_linger_request(lreq); |
| @@ -3657,7 +3597,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, | |||
| 3657 | 3597 | ||
| 3658 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); | 3598 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); |
| 3659 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); | 3599 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); |
| 3660 | req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; | 3600 | req->r_flags = CEPH_OSD_FLAG_WRITE; |
| 3661 | req->r_mtime = CURRENT_TIME; | 3601 | req->r_mtime = CURRENT_TIME; |
| 3662 | osd_req_op_watch_init(req, 0, lreq->linger_id, | 3602 | osd_req_op_watch_init(req, 0, lreq->linger_id, |
| 3663 | CEPH_OSD_WATCH_OP_UNWATCH); | 3603 | CEPH_OSD_WATCH_OP_UNWATCH); |
| @@ -4022,7 +3962,7 @@ EXPORT_SYMBOL(ceph_osdc_maybe_request_map); | |||
| 4022 | * Execute an OSD class method on an object. | 3962 | * Execute an OSD class method on an object. |
| 4023 | * | 3963 | * |
| 4024 | * @flags: CEPH_OSD_FLAG_* | 3964 | * @flags: CEPH_OSD_FLAG_* |
| 4025 | * @resp_len: out param for reply length | 3965 | * @resp_len: in/out param for reply length |
| 4026 | */ | 3966 | */ |
| 4027 | int ceph_osdc_call(struct ceph_osd_client *osdc, | 3967 | int ceph_osdc_call(struct ceph_osd_client *osdc, |
| 4028 | struct ceph_object_id *oid, | 3968 | struct ceph_object_id *oid, |
| @@ -4035,6 +3975,9 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, | |||
| 4035 | struct ceph_osd_request *req; | 3975 | struct ceph_osd_request *req; |
| 4036 | int ret; | 3976 | int ret; |
| 4037 | 3977 | ||
| 3978 | if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE)) | ||
| 3979 | return -E2BIG; | ||
| 3980 | |||
| 4038 | req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); | 3981 | req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); |
| 4039 | if (!req) | 3982 | if (!req) |
| 4040 | return -ENOMEM; | 3983 | return -ENOMEM; |
| @@ -4053,7 +3996,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, | |||
| 4053 | 0, false, false); | 3996 | 0, false, false); |
| 4054 | if (resp_page) | 3997 | if (resp_page) |
| 4055 | osd_req_op_cls_response_data_pages(req, 0, &resp_page, | 3998 | osd_req_op_cls_response_data_pages(req, 0, &resp_page, |
| 4056 | PAGE_SIZE, 0, false, false); | 3999 | *resp_len, 0, false, false); |
| 4057 | 4000 | ||
| 4058 | ceph_osdc_start_request(osdc, req, false); | 4001 | ceph_osdc_start_request(osdc, req, false); |
| 4059 | ret = ceph_osdc_wait_request(osdc, req); | 4002 | ret = ceph_osdc_wait_request(osdc, req); |
| @@ -4220,8 +4163,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
| 4220 | int page_align = off & ~PAGE_MASK; | 4163 | int page_align = off & ~PAGE_MASK; |
| 4221 | 4164 | ||
| 4222 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, | 4165 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, |
| 4223 | CEPH_OSD_OP_WRITE, | 4166 | CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, |
| 4224 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, | ||
| 4225 | snapc, truncate_seq, truncate_size, | 4167 | snapc, truncate_seq, truncate_size, |
| 4226 | true); | 4168 | true); |
| 4227 | if (IS_ERR(req)) | 4169 | if (IS_ERR(req)) |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d2436880b305..6824c0ec8373 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
| @@ -153,6 +153,32 @@ bad: | |||
| 153 | return -EINVAL; | 153 | return -EINVAL; |
| 154 | } | 154 | } |
| 155 | 155 | ||
| 156 | static void crush_finalize(struct crush_map *c) | ||
| 157 | { | ||
| 158 | __s32 b; | ||
| 159 | |||
| 160 | /* Space for the array of pointers to per-bucket workspace */ | ||
| 161 | c->working_size = sizeof(struct crush_work) + | ||
| 162 | c->max_buckets * sizeof(struct crush_work_bucket *); | ||
| 163 | |||
| 164 | for (b = 0; b < c->max_buckets; b++) { | ||
| 165 | if (!c->buckets[b]) | ||
| 166 | continue; | ||
| 167 | |||
| 168 | switch (c->buckets[b]->alg) { | ||
| 169 | default: | ||
| 170 | /* | ||
| 171 | * The base case, permutation variables and | ||
| 172 | * the pointer to the permutation array. | ||
| 173 | */ | ||
| 174 | c->working_size += sizeof(struct crush_work_bucket); | ||
| 175 | break; | ||
| 176 | } | ||
| 177 | /* Every bucket has a permutation array. */ | ||
| 178 | c->working_size += c->buckets[b]->size * sizeof(__u32); | ||
| 179 | } | ||
| 180 | } | ||
| 181 | |||
| 156 | static struct crush_map *crush_decode(void *pbyval, void *end) | 182 | static struct crush_map *crush_decode(void *pbyval, void *end) |
| 157 | { | 183 | { |
| 158 | struct crush_map *c; | 184 | struct crush_map *c; |
| @@ -246,10 +272,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
| 246 | b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); | 272 | b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); |
| 247 | if (b->items == NULL) | 273 | if (b->items == NULL) |
| 248 | goto badmem; | 274 | goto badmem; |
| 249 | b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); | ||
| 250 | if (b->perm == NULL) | ||
| 251 | goto badmem; | ||
| 252 | b->perm_n = 0; | ||
| 253 | 275 | ||
| 254 | ceph_decode_need(p, end, b->size*sizeof(u32), bad); | 276 | ceph_decode_need(p, end, b->size*sizeof(u32), bad); |
| 255 | for (j = 0; j < b->size; j++) | 277 | for (j = 0; j < b->size; j++) |
| @@ -368,6 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) | |||
| 368 | dout("crush decode tunable chooseleaf_stable = %d\n", | 390 | dout("crush decode tunable chooseleaf_stable = %d\n", |
| 369 | c->chooseleaf_stable); | 391 | c->chooseleaf_stable); |
| 370 | 392 | ||
| 393 | crush_finalize(c); | ||
| 394 | |||
| 371 | done: | 395 | done: |
| 372 | dout("crush_decode success\n"); | 396 | dout("crush_decode success\n"); |
| 373 | return c; | 397 | return c; |
| @@ -719,7 +743,7 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) | |||
| 719 | map->pool_max = -1; | 743 | map->pool_max = -1; |
| 720 | map->pg_temp = RB_ROOT; | 744 | map->pg_temp = RB_ROOT; |
| 721 | map->primary_temp = RB_ROOT; | 745 | map->primary_temp = RB_ROOT; |
| 722 | mutex_init(&map->crush_scratch_mutex); | 746 | mutex_init(&map->crush_workspace_mutex); |
| 723 | 747 | ||
| 724 | return map; | 748 | return map; |
| 725 | } | 749 | } |
| @@ -753,6 +777,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) | |||
| 753 | kfree(map->osd_weight); | 777 | kfree(map->osd_weight); |
| 754 | kfree(map->osd_addr); | 778 | kfree(map->osd_addr); |
| 755 | kfree(map->osd_primary_affinity); | 779 | kfree(map->osd_primary_affinity); |
| 780 | kfree(map->crush_workspace); | ||
| 756 | kfree(map); | 781 | kfree(map); |
| 757 | } | 782 | } |
| 758 | 783 | ||
| @@ -808,6 +833,31 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) | |||
| 808 | return 0; | 833 | return 0; |
| 809 | } | 834 | } |
| 810 | 835 | ||
| 836 | static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) | ||
| 837 | { | ||
| 838 | void *workspace; | ||
| 839 | size_t work_size; | ||
| 840 | |||
| 841 | if (IS_ERR(crush)) | ||
| 842 | return PTR_ERR(crush); | ||
| 843 | |||
| 844 | work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); | ||
| 845 | dout("%s work_size %zu bytes\n", __func__, work_size); | ||
| 846 | workspace = kmalloc(work_size, GFP_NOIO); | ||
| 847 | if (!workspace) { | ||
| 848 | crush_destroy(crush); | ||
| 849 | return -ENOMEM; | ||
| 850 | } | ||
| 851 | crush_init_workspace(crush, workspace); | ||
| 852 | |||
| 853 | if (map->crush) | ||
| 854 | crush_destroy(map->crush); | ||
| 855 | kfree(map->crush_workspace); | ||
| 856 | map->crush = crush; | ||
| 857 | map->crush_workspace = workspace; | ||
| 858 | return 0; | ||
| 859 | } | ||
| 860 | |||
| 811 | #define OSDMAP_WRAPPER_COMPAT_VER 7 | 861 | #define OSDMAP_WRAPPER_COMPAT_VER 7 |
| 812 | #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 | 862 | #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 |
| 813 | 863 | ||
| @@ -1214,13 +1264,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) | |||
| 1214 | 1264 | ||
| 1215 | /* crush */ | 1265 | /* crush */ |
| 1216 | ceph_decode_32_safe(p, end, len, e_inval); | 1266 | ceph_decode_32_safe(p, end, len, e_inval); |
| 1217 | map->crush = crush_decode(*p, min(*p + len, end)); | 1267 | err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); |
| 1218 | if (IS_ERR(map->crush)) { | 1268 | if (err) |
| 1219 | err = PTR_ERR(map->crush); | ||
| 1220 | map->crush = NULL; | ||
| 1221 | goto bad; | 1269 | goto bad; |
| 1222 | } | ||
| 1223 | *p += len; | ||
| 1224 | 1270 | ||
| 1225 | /* ignore the rest */ | 1271 | /* ignore the rest */ |
| 1226 | *p = end; | 1272 | *p = end; |
| @@ -1375,7 +1421,6 @@ e_inval: | |||
| 1375 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | 1421 | struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, |
| 1376 | struct ceph_osdmap *map) | 1422 | struct ceph_osdmap *map) |
| 1377 | { | 1423 | { |
| 1378 | struct crush_map *newcrush = NULL; | ||
| 1379 | struct ceph_fsid fsid; | 1424 | struct ceph_fsid fsid; |
| 1380 | u32 epoch = 0; | 1425 | u32 epoch = 0; |
| 1381 | struct ceph_timespec modified; | 1426 | struct ceph_timespec modified; |
| @@ -1414,12 +1459,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 1414 | /* new crush? */ | 1459 | /* new crush? */ |
| 1415 | ceph_decode_32_safe(p, end, len, e_inval); | 1460 | ceph_decode_32_safe(p, end, len, e_inval); |
| 1416 | if (len > 0) { | 1461 | if (len > 0) { |
| 1417 | newcrush = crush_decode(*p, min(*p+len, end)); | 1462 | err = osdmap_set_crush(map, |
| 1418 | if (IS_ERR(newcrush)) { | 1463 | crush_decode(*p, min(*p + len, end))); |
| 1419 | err = PTR_ERR(newcrush); | 1464 | if (err) |
| 1420 | newcrush = NULL; | ||
| 1421 | goto bad; | 1465 | goto bad; |
| 1422 | } | ||
| 1423 | *p += len; | 1466 | *p += len; |
| 1424 | } | 1467 | } |
| 1425 | 1468 | ||
| @@ -1439,12 +1482,6 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, | |||
| 1439 | 1482 | ||
| 1440 | map->epoch++; | 1483 | map->epoch++; |
| 1441 | map->modified = modified; | 1484 | map->modified = modified; |
| 1442 | if (newcrush) { | ||
| 1443 | if (map->crush) | ||
| 1444 | crush_destroy(map->crush); | ||
| 1445 | map->crush = newcrush; | ||
| 1446 | newcrush = NULL; | ||
| 1447 | } | ||
| 1448 | 1485 | ||
| 1449 | /* new_pools */ | 1486 | /* new_pools */ |
| 1450 | err = decode_new_pools(p, end, map); | 1487 | err = decode_new_pools(p, end, map); |
| @@ -1505,8 +1542,6 @@ bad: | |||
| 1505 | print_hex_dump(KERN_DEBUG, "osdmap: ", | 1542 | print_hex_dump(KERN_DEBUG, "osdmap: ", |
| 1506 | DUMP_PREFIX_OFFSET, 16, 1, | 1543 | DUMP_PREFIX_OFFSET, 16, 1, |
| 1507 | start, end - start, true); | 1544 | start, end - start, true); |
| 1508 | if (newcrush) | ||
| 1509 | crush_destroy(newcrush); | ||
| 1510 | return ERR_PTR(err); | 1545 | return ERR_PTR(err); |
| 1511 | } | 1546 | } |
| 1512 | 1547 | ||
| @@ -1942,10 +1977,10 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, | |||
| 1942 | 1977 | ||
| 1943 | BUG_ON(result_max > CEPH_PG_MAX_SIZE); | 1978 | BUG_ON(result_max > CEPH_PG_MAX_SIZE); |
| 1944 | 1979 | ||
| 1945 | mutex_lock(&map->crush_scratch_mutex); | 1980 | mutex_lock(&map->crush_workspace_mutex); |
| 1946 | r = crush_do_rule(map->crush, ruleno, x, result, result_max, | 1981 | r = crush_do_rule(map->crush, ruleno, x, result, result_max, |
| 1947 | weight, weight_max, map->crush_scratch_ary); | 1982 | weight, weight_max, map->crush_workspace); |
| 1948 | mutex_unlock(&map->crush_scratch_mutex); | 1983 | mutex_unlock(&map->crush_workspace_mutex); |
| 1949 | 1984 | ||
| 1950 | return r; | 1985 | return r; |
| 1951 | } | 1986 | } |
| @@ -1978,8 +2013,14 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, | |||
| 1978 | return; | 2013 | return; |
| 1979 | } | 2014 | } |
| 1980 | 2015 | ||
| 1981 | len = do_crush(osdmap, ruleno, pps, raw->osds, | 2016 | if (pi->size > ARRAY_SIZE(raw->osds)) { |
| 1982 | min_t(int, pi->size, ARRAY_SIZE(raw->osds)), | 2017 | pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", |
| 2018 | pi->id, pi->crush_ruleset, pi->type, pi->size, | ||
| 2019 | ARRAY_SIZE(raw->osds)); | ||
| 2020 | return; | ||
| 2021 | } | ||
| 2022 | |||
| 2023 | len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, | ||
| 1983 | osdmap->osd_weight, osdmap->max_osd); | 2024 | osdmap->osd_weight, osdmap->max_osd); |
| 1984 | if (len < 0) { | 2025 | if (len < 0) { |
| 1985 | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", | 2026 | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", |
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 154683f5f14c..705414e78ae0 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c | |||
| @@ -18,8 +18,6 @@ | |||
| 18 | * 02110-1301, USA. | 18 | * 02110-1301, USA. |
| 19 | */ | 19 | */ |
| 20 | 20 | ||
| 21 | #include <stddef.h> | ||
| 22 | |||
| 23 | #include <linux/types.h> | 21 | #include <linux/types.h> |
| 24 | #include <linux/export.h> | 22 | #include <linux/export.h> |
| 25 | #include <linux/ceph/libceph.h> | 23 | #include <linux/ceph/libceph.h> |
