diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-bus-rbd | 2 | ||||
-rw-r--r-- | drivers/block/rbd.c | 361 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 6 | ||||
-rw-r--r-- | fs/ceph/dir.c | 24 | ||||
-rw-r--r-- | fs/ceph/file.c | 10 | ||||
-rw-r--r-- | fs/ceph/inode.c | 25 | ||||
-rw-r--r-- | fs/ceph/super.c | 9 | ||||
-rw-r--r-- | fs/ceph/super.h | 66 | ||||
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 19 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 1 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 57 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 39 | ||||
-rw-r--r-- | net/ceph/armor.c | 4 | ||||
-rw-r--r-- | net/ceph/ceph_common.c | 1 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 624 |
15 files changed, 1018 insertions, 230 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-rbd b/Documentation/ABI/testing/sysfs-bus-rbd index 90a87e2a572b..fa72ccb2282e 100644 --- a/Documentation/ABI/testing/sysfs-bus-rbd +++ b/Documentation/ABI/testing/sysfs-bus-rbd | |||
@@ -1,6 +1,6 @@ | |||
1 | What: /sys/bus/rbd/ | 1 | What: /sys/bus/rbd/ |
2 | Date: November 2010 | 2 | Date: November 2010 |
3 | Contact: Yehuda Sadeh <yehuda@hq.newdream.net>, | 3 | Contact: Yehuda Sadeh <yehuda@newdream.net>, |
4 | Sage Weil <sage@newdream.net> | 4 | Sage Weil <sage@newdream.net> |
5 | Description: | 5 | Description: |
6 | 6 | ||
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e1e38b11f48a..16dc3645291c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/ceph/osd_client.h> | 31 | #include <linux/ceph/osd_client.h> |
32 | #include <linux/ceph/mon_client.h> | 32 | #include <linux/ceph/mon_client.h> |
33 | #include <linux/ceph/decode.h> | 33 | #include <linux/ceph/decode.h> |
34 | #include <linux/parser.h> | ||
34 | 35 | ||
35 | #include <linux/kernel.h> | 36 | #include <linux/kernel.h> |
36 | #include <linux/device.h> | 37 | #include <linux/device.h> |
@@ -54,6 +55,8 @@ | |||
54 | 55 | ||
55 | #define DEV_NAME_LEN 32 | 56 | #define DEV_NAME_LEN 32 |
56 | 57 | ||
58 | #define RBD_NOTIFY_TIMEOUT_DEFAULT 10 | ||
59 | |||
57 | /* | 60 | /* |
58 | * block device image metadata (in-memory version) | 61 | * block device image metadata (in-memory version) |
59 | */ | 62 | */ |
@@ -71,6 +74,12 @@ struct rbd_image_header { | |||
71 | 74 | ||
72 | char *snap_names; | 75 | char *snap_names; |
73 | u64 *snap_sizes; | 76 | u64 *snap_sizes; |
77 | |||
78 | u64 obj_version; | ||
79 | }; | ||
80 | |||
81 | struct rbd_options { | ||
82 | int notify_timeout; | ||
74 | }; | 83 | }; |
75 | 84 | ||
76 | /* | 85 | /* |
@@ -78,6 +87,7 @@ struct rbd_image_header { | |||
78 | */ | 87 | */ |
79 | struct rbd_client { | 88 | struct rbd_client { |
80 | struct ceph_client *client; | 89 | struct ceph_client *client; |
90 | struct rbd_options *rbd_opts; | ||
81 | struct kref kref; | 91 | struct kref kref; |
82 | struct list_head node; | 92 | struct list_head node; |
83 | }; | 93 | }; |
@@ -124,6 +134,9 @@ struct rbd_device { | |||
124 | char pool_name[RBD_MAX_POOL_NAME_LEN]; | 134 | char pool_name[RBD_MAX_POOL_NAME_LEN]; |
125 | int poolid; | 135 | int poolid; |
126 | 136 | ||
137 | struct ceph_osd_event *watch_event; | ||
138 | struct ceph_osd_request *watch_request; | ||
139 | |||
127 | char snap_name[RBD_MAX_SNAP_NAME_LEN]; | 140 | char snap_name[RBD_MAX_SNAP_NAME_LEN]; |
128 | u32 cur_snap; /* index+1 of current snapshot within snap context | 141 | u32 cur_snap; /* index+1 of current snapshot within snap context |
129 | 0 - for the head */ | 142 | 0 - for the head */ |
@@ -177,6 +190,8 @@ static void rbd_put_dev(struct rbd_device *rbd_dev) | |||
177 | put_device(&rbd_dev->dev); | 190 | put_device(&rbd_dev->dev); |
178 | } | 191 | } |
179 | 192 | ||
193 | static int __rbd_update_snaps(struct rbd_device *rbd_dev); | ||
194 | |||
180 | static int rbd_open(struct block_device *bdev, fmode_t mode) | 195 | static int rbd_open(struct block_device *bdev, fmode_t mode) |
181 | { | 196 | { |
182 | struct gendisk *disk = bdev->bd_disk; | 197 | struct gendisk *disk = bdev->bd_disk; |
@@ -211,7 +226,8 @@ static const struct block_device_operations rbd_bd_ops = { | |||
211 | * Initialize an rbd client instance. | 226 | * Initialize an rbd client instance. |
212 | * We own *opt. | 227 | * We own *opt. |
213 | */ | 228 | */ |
214 | static struct rbd_client *rbd_client_create(struct ceph_options *opt) | 229 | static struct rbd_client *rbd_client_create(struct ceph_options *opt, |
230 | struct rbd_options *rbd_opts) | ||
215 | { | 231 | { |
216 | struct rbd_client *rbdc; | 232 | struct rbd_client *rbdc; |
217 | int ret = -ENOMEM; | 233 | int ret = -ENOMEM; |
@@ -233,6 +249,8 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt) | |||
233 | if (ret < 0) | 249 | if (ret < 0) |
234 | goto out_err; | 250 | goto out_err; |
235 | 251 | ||
252 | rbdc->rbd_opts = rbd_opts; | ||
253 | |||
236 | spin_lock(&node_lock); | 254 | spin_lock(&node_lock); |
237 | list_add_tail(&rbdc->node, &rbd_client_list); | 255 | list_add_tail(&rbdc->node, &rbd_client_list); |
238 | spin_unlock(&node_lock); | 256 | spin_unlock(&node_lock); |
@@ -267,6 +285,59 @@ static struct rbd_client *__rbd_client_find(struct ceph_options *opt) | |||
267 | } | 285 | } |
268 | 286 | ||
269 | /* | 287 | /* |
288 | * mount options | ||
289 | */ | ||
290 | enum { | ||
291 | Opt_notify_timeout, | ||
292 | Opt_last_int, | ||
293 | /* int args above */ | ||
294 | Opt_last_string, | ||
295 | /* string args above */ | ||
296 | }; | ||
297 | |||
298 | static match_table_t rbdopt_tokens = { | ||
299 | {Opt_notify_timeout, "notify_timeout=%d"}, | ||
300 | /* int args above */ | ||
301 | /* string args above */ | ||
302 | {-1, NULL} | ||
303 | }; | ||
304 | |||
305 | static int parse_rbd_opts_token(char *c, void *private) | ||
306 | { | ||
307 | struct rbd_options *rbdopt = private; | ||
308 | substring_t argstr[MAX_OPT_ARGS]; | ||
309 | int token, intval, ret; | ||
310 | |||
311 | token = match_token((char *)c, rbdopt_tokens, argstr); | ||
312 | if (token < 0) | ||
313 | return -EINVAL; | ||
314 | |||
315 | if (token < Opt_last_int) { | ||
316 | ret = match_int(&argstr[0], &intval); | ||
317 | if (ret < 0) { | ||
318 | pr_err("bad mount option arg (not int) " | ||
319 | "at '%s'\n", c); | ||
320 | return ret; | ||
321 | } | ||
322 | dout("got int token %d val %d\n", token, intval); | ||
323 | } else if (token > Opt_last_int && token < Opt_last_string) { | ||
324 | dout("got string token %d val %s\n", token, | ||
325 | argstr[0].from); | ||
326 | } else { | ||
327 | dout("got token %d\n", token); | ||
328 | } | ||
329 | |||
330 | switch (token) { | ||
331 | case Opt_notify_timeout: | ||
332 | rbdopt->notify_timeout = intval; | ||
333 | break; | ||
334 | default: | ||
335 | BUG_ON(token); | ||
336 | } | ||
337 | return 0; | ||
338 | } | ||
339 | |||
340 | /* | ||
270 | * Get a ceph client with specific addr and configuration, if one does | 341 | * Get a ceph client with specific addr and configuration, if one does |
271 | * not exist create it. | 342 | * not exist create it. |
272 | */ | 343 | */ |
@@ -276,11 +347,18 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, | |||
276 | struct rbd_client *rbdc; | 347 | struct rbd_client *rbdc; |
277 | struct ceph_options *opt; | 348 | struct ceph_options *opt; |
278 | int ret; | 349 | int ret; |
350 | struct rbd_options *rbd_opts; | ||
351 | |||
352 | rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); | ||
353 | if (!rbd_opts) | ||
354 | return -ENOMEM; | ||
355 | |||
356 | rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; | ||
279 | 357 | ||
280 | ret = ceph_parse_options(&opt, options, mon_addr, | 358 | ret = ceph_parse_options(&opt, options, mon_addr, |
281 | mon_addr + strlen(mon_addr), NULL, NULL); | 359 | mon_addr + strlen(mon_addr), parse_rbd_opts_token, rbd_opts); |
282 | if (ret < 0) | 360 | if (ret < 0) |
283 | return ret; | 361 | goto done_err; |
284 | 362 | ||
285 | spin_lock(&node_lock); | 363 | spin_lock(&node_lock); |
286 | rbdc = __rbd_client_find(opt); | 364 | rbdc = __rbd_client_find(opt); |
@@ -296,13 +374,18 @@ static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr, | |||
296 | } | 374 | } |
297 | spin_unlock(&node_lock); | 375 | spin_unlock(&node_lock); |
298 | 376 | ||
299 | rbdc = rbd_client_create(opt); | 377 | rbdc = rbd_client_create(opt, rbd_opts); |
300 | if (IS_ERR(rbdc)) | 378 | if (IS_ERR(rbdc)) { |
301 | return PTR_ERR(rbdc); | 379 | ret = PTR_ERR(rbdc); |
380 | goto done_err; | ||
381 | } | ||
302 | 382 | ||
303 | rbd_dev->rbd_client = rbdc; | 383 | rbd_dev->rbd_client = rbdc; |
304 | rbd_dev->client = rbdc->client; | 384 | rbd_dev->client = rbdc->client; |
305 | return 0; | 385 | return 0; |
386 | done_err: | ||
387 | kfree(rbd_opts); | ||
388 | return ret; | ||
306 | } | 389 | } |
307 | 390 | ||
308 | /* | 391 | /* |
@@ -318,6 +401,7 @@ static void rbd_client_release(struct kref *kref) | |||
318 | spin_unlock(&node_lock); | 401 | spin_unlock(&node_lock); |
319 | 402 | ||
320 | ceph_destroy_client(rbdc->client); | 403 | ceph_destroy_client(rbdc->client); |
404 | kfree(rbdc->rbd_opts); | ||
321 | kfree(rbdc); | 405 | kfree(rbdc); |
322 | } | 406 | } |
323 | 407 | ||
@@ -666,7 +750,9 @@ static int rbd_do_request(struct request *rq, | |||
666 | struct ceph_osd_req_op *ops, | 750 | struct ceph_osd_req_op *ops, |
667 | int num_reply, | 751 | int num_reply, |
668 | void (*rbd_cb)(struct ceph_osd_request *req, | 752 | void (*rbd_cb)(struct ceph_osd_request *req, |
669 | struct ceph_msg *msg)) | 753 | struct ceph_msg *msg), |
754 | struct ceph_osd_request **linger_req, | ||
755 | u64 *ver) | ||
670 | { | 756 | { |
671 | struct ceph_osd_request *req; | 757 | struct ceph_osd_request *req; |
672 | struct ceph_file_layout *layout; | 758 | struct ceph_file_layout *layout; |
@@ -729,12 +815,20 @@ static int rbd_do_request(struct request *rq, | |||
729 | req->r_oid, req->r_oid_len); | 815 | req->r_oid, req->r_oid_len); |
730 | up_read(&header->snap_rwsem); | 816 | up_read(&header->snap_rwsem); |
731 | 817 | ||
818 | if (linger_req) { | ||
819 | ceph_osdc_set_request_linger(&dev->client->osdc, req); | ||
820 | *linger_req = req; | ||
821 | } | ||
822 | |||
732 | ret = ceph_osdc_start_request(&dev->client->osdc, req, false); | 823 | ret = ceph_osdc_start_request(&dev->client->osdc, req, false); |
733 | if (ret < 0) | 824 | if (ret < 0) |
734 | goto done_err; | 825 | goto done_err; |
735 | 826 | ||
736 | if (!rbd_cb) { | 827 | if (!rbd_cb) { |
737 | ret = ceph_osdc_wait_request(&dev->client->osdc, req); | 828 | ret = ceph_osdc_wait_request(&dev->client->osdc, req); |
829 | if (ver) | ||
830 | *ver = le64_to_cpu(req->r_reassert_version.version); | ||
831 | dout("reassert_ver=%lld\n", le64_to_cpu(req->r_reassert_version.version)); | ||
738 | ceph_osdc_put_request(req); | 832 | ceph_osdc_put_request(req); |
739 | } | 833 | } |
740 | return ret; | 834 | return ret; |
@@ -789,6 +883,11 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) | |||
789 | kfree(req_data); | 883 | kfree(req_data); |
790 | } | 884 | } |
791 | 885 | ||
886 | static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) | ||
887 | { | ||
888 | ceph_osdc_put_request(req); | ||
889 | } | ||
890 | |||
792 | /* | 891 | /* |
793 | * Do a synchronous ceph osd operation | 892 | * Do a synchronous ceph osd operation |
794 | */ | 893 | */ |
@@ -801,7 +900,9 @@ static int rbd_req_sync_op(struct rbd_device *dev, | |||
801 | int num_reply, | 900 | int num_reply, |
802 | const char *obj, | 901 | const char *obj, |
803 | u64 ofs, u64 len, | 902 | u64 ofs, u64 len, |
804 | char *buf) | 903 | char *buf, |
904 | struct ceph_osd_request **linger_req, | ||
905 | u64 *ver) | ||
805 | { | 906 | { |
806 | int ret; | 907 | int ret; |
807 | struct page **pages; | 908 | struct page **pages; |
@@ -833,7 +934,8 @@ static int rbd_req_sync_op(struct rbd_device *dev, | |||
833 | flags, | 934 | flags, |
834 | ops, | 935 | ops, |
835 | 2, | 936 | 2, |
836 | NULL); | 937 | NULL, |
938 | linger_req, ver); | ||
837 | if (ret < 0) | 939 | if (ret < 0) |
838 | goto done_ops; | 940 | goto done_ops; |
839 | 941 | ||
@@ -893,7 +995,7 @@ static int rbd_do_op(struct request *rq, | |||
893 | flags, | 995 | flags, |
894 | ops, | 996 | ops, |
895 | num_reply, | 997 | num_reply, |
896 | rbd_req_cb); | 998 | rbd_req_cb, 0, NULL); |
897 | done: | 999 | done: |
898 | kfree(seg_name); | 1000 | kfree(seg_name); |
899 | return ret; | 1001 | return ret; |
@@ -940,18 +1042,174 @@ static int rbd_req_sync_read(struct rbd_device *dev, | |||
940 | u64 snapid, | 1042 | u64 snapid, |
941 | const char *obj, | 1043 | const char *obj, |
942 | u64 ofs, u64 len, | 1044 | u64 ofs, u64 len, |
943 | char *buf) | 1045 | char *buf, |
1046 | u64 *ver) | ||
944 | { | 1047 | { |
945 | return rbd_req_sync_op(dev, NULL, | 1048 | return rbd_req_sync_op(dev, NULL, |
946 | (snapid ? snapid : CEPH_NOSNAP), | 1049 | (snapid ? snapid : CEPH_NOSNAP), |
947 | CEPH_OSD_OP_READ, | 1050 | CEPH_OSD_OP_READ, |
948 | CEPH_OSD_FLAG_READ, | 1051 | CEPH_OSD_FLAG_READ, |
949 | NULL, | 1052 | NULL, |
950 | 1, obj, ofs, len, buf); | 1053 | 1, obj, ofs, len, buf, NULL, ver); |
951 | } | 1054 | } |
952 | 1055 | ||
953 | /* | 1056 | /* |
954 | * Request sync osd read | 1057 | * Request sync osd watch |
1058 | */ | ||
1059 | static int rbd_req_sync_notify_ack(struct rbd_device *dev, | ||
1060 | u64 ver, | ||
1061 | u64 notify_id, | ||
1062 | const char *obj) | ||
1063 | { | ||
1064 | struct ceph_osd_req_op *ops; | ||
1065 | struct page **pages = NULL; | ||
1066 | int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); | ||
1067 | if (ret < 0) | ||
1068 | return ret; | ||
1069 | |||
1070 | ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); | ||
1071 | ops[0].watch.cookie = notify_id; | ||
1072 | ops[0].watch.flag = 0; | ||
1073 | |||
1074 | ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP, | ||
1075 | obj, 0, 0, NULL, | ||
1076 | pages, 0, | ||
1077 | CEPH_OSD_FLAG_READ, | ||
1078 | ops, | ||
1079 | 1, | ||
1080 | rbd_simple_req_cb, 0, NULL); | ||
1081 | |||
1082 | rbd_destroy_ops(ops); | ||
1083 | return ret; | ||
1084 | } | ||
1085 | |||
1086 | static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | ||
1087 | { | ||
1088 | struct rbd_device *dev = (struct rbd_device *)data; | ||
1089 | if (!dev) | ||
1090 | return; | ||
1091 | |||
1092 | dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, | ||
1093 | notify_id, (int)opcode); | ||
1094 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | ||
1095 | __rbd_update_snaps(dev); | ||
1096 | mutex_unlock(&ctl_mutex); | ||
1097 | |||
1098 | rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); | ||
1099 | } | ||
1100 | |||
1101 | /* | ||
1102 | * Request sync osd watch | ||
1103 | */ | ||
1104 | static int rbd_req_sync_watch(struct rbd_device *dev, | ||
1105 | const char *obj, | ||
1106 | u64 ver) | ||
1107 | { | ||
1108 | struct ceph_osd_req_op *ops; | ||
1109 | struct ceph_osd_client *osdc = &dev->client->osdc; | ||
1110 | |||
1111 | int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); | ||
1112 | if (ret < 0) | ||
1113 | return ret; | ||
1114 | |||
1115 | ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, | ||
1116 | (void *)dev, &dev->watch_event); | ||
1117 | if (ret < 0) | ||
1118 | goto fail; | ||
1119 | |||
1120 | ops[0].watch.ver = cpu_to_le64(ver); | ||
1121 | ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); | ||
1122 | ops[0].watch.flag = 1; | ||
1123 | |||
1124 | ret = rbd_req_sync_op(dev, NULL, | ||
1125 | CEPH_NOSNAP, | ||
1126 | 0, | ||
1127 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | ||
1128 | ops, | ||
1129 | 1, obj, 0, 0, NULL, | ||
1130 | &dev->watch_request, NULL); | ||
1131 | |||
1132 | if (ret < 0) | ||
1133 | goto fail_event; | ||
1134 | |||
1135 | rbd_destroy_ops(ops); | ||
1136 | return 0; | ||
1137 | |||
1138 | fail_event: | ||
1139 | ceph_osdc_cancel_event(dev->watch_event); | ||
1140 | dev->watch_event = NULL; | ||
1141 | fail: | ||
1142 | rbd_destroy_ops(ops); | ||
1143 | return ret; | ||
1144 | } | ||
1145 | |||
1146 | struct rbd_notify_info { | ||
1147 | struct rbd_device *dev; | ||
1148 | }; | ||
1149 | |||
1150 | static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) | ||
1151 | { | ||
1152 | struct rbd_device *dev = (struct rbd_device *)data; | ||
1153 | if (!dev) | ||
1154 | return; | ||
1155 | |||
1156 | dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, | ||
1157 | notify_id, (int)opcode); | ||
1158 | } | ||
1159 | |||
1160 | /* | ||
1161 | * Request sync osd notify | ||
1162 | */ | ||
1163 | static int rbd_req_sync_notify(struct rbd_device *dev, | ||
1164 | const char *obj) | ||
1165 | { | ||
1166 | struct ceph_osd_req_op *ops; | ||
1167 | struct ceph_osd_client *osdc = &dev->client->osdc; | ||
1168 | struct ceph_osd_event *event; | ||
1169 | struct rbd_notify_info info; | ||
1170 | int payload_len = sizeof(u32) + sizeof(u32); | ||
1171 | int ret; | ||
1172 | |||
1173 | ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); | ||
1174 | if (ret < 0) | ||
1175 | return ret; | ||
1176 | |||
1177 | info.dev = dev; | ||
1178 | |||
1179 | ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, | ||
1180 | (void *)&info, &event); | ||
1181 | if (ret < 0) | ||
1182 | goto fail; | ||
1183 | |||
1184 | ops[0].watch.ver = 1; | ||
1185 | ops[0].watch.flag = 1; | ||
1186 | ops[0].watch.cookie = event->cookie; | ||
1187 | ops[0].watch.prot_ver = RADOS_NOTIFY_VER; | ||
1188 | ops[0].watch.timeout = 12; | ||
1189 | |||
1190 | ret = rbd_req_sync_op(dev, NULL, | ||
1191 | CEPH_NOSNAP, | ||
1192 | 0, | ||
1193 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | ||
1194 | ops, | ||
1195 | 1, obj, 0, 0, NULL, NULL, NULL); | ||
1196 | if (ret < 0) | ||
1197 | goto fail_event; | ||
1198 | |||
1199 | ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT); | ||
1200 | dout("ceph_osdc_wait_event returned %d\n", ret); | ||
1201 | rbd_destroy_ops(ops); | ||
1202 | return 0; | ||
1203 | |||
1204 | fail_event: | ||
1205 | ceph_osdc_cancel_event(event); | ||
1206 | fail: | ||
1207 | rbd_destroy_ops(ops); | ||
1208 | return ret; | ||
1209 | } | ||
1210 | |||
1211 | /* | ||
1212 | * Request sync osd rollback | ||
955 | */ | 1213 | */ |
956 | static int rbd_req_sync_rollback_obj(struct rbd_device *dev, | 1214 | static int rbd_req_sync_rollback_obj(struct rbd_device *dev, |
957 | u64 snapid, | 1215 | u64 snapid, |
@@ -969,13 +1227,10 @@ static int rbd_req_sync_rollback_obj(struct rbd_device *dev, | |||
969 | 0, | 1227 | 0, |
970 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 1228 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, |
971 | ops, | 1229 | ops, |
972 | 1, obj, 0, 0, NULL); | 1230 | 1, obj, 0, 0, NULL, NULL, NULL); |
973 | 1231 | ||
974 | rbd_destroy_ops(ops); | 1232 | rbd_destroy_ops(ops); |
975 | 1233 | ||
976 | if (ret < 0) | ||
977 | return ret; | ||
978 | |||
979 | return ret; | 1234 | return ret; |
980 | } | 1235 | } |
981 | 1236 | ||
@@ -987,7 +1242,8 @@ static int rbd_req_sync_exec(struct rbd_device *dev, | |||
987 | const char *cls, | 1242 | const char *cls, |
988 | const char *method, | 1243 | const char *method, |
989 | const char *data, | 1244 | const char *data, |
990 | int len) | 1245 | int len, |
1246 | u64 *ver) | ||
991 | { | 1247 | { |
992 | struct ceph_osd_req_op *ops; | 1248 | struct ceph_osd_req_op *ops; |
993 | int cls_len = strlen(cls); | 1249 | int cls_len = strlen(cls); |
@@ -1010,7 +1266,7 @@ static int rbd_req_sync_exec(struct rbd_device *dev, | |||
1010 | 0, | 1266 | 0, |
1011 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, | 1267 | CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, |
1012 | ops, | 1268 | ops, |
1013 | 1, obj, 0, 0, NULL); | 1269 | 1, obj, 0, 0, NULL, NULL, ver); |
1014 | 1270 | ||
1015 | rbd_destroy_ops(ops); | 1271 | rbd_destroy_ops(ops); |
1016 | 1272 | ||
@@ -1156,6 +1412,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev, | |||
1156 | struct rbd_image_header_ondisk *dh; | 1412 | struct rbd_image_header_ondisk *dh; |
1157 | int snap_count = 0; | 1413 | int snap_count = 0; |
1158 | u64 snap_names_len = 0; | 1414 | u64 snap_names_len = 0; |
1415 | u64 ver; | ||
1159 | 1416 | ||
1160 | while (1) { | 1417 | while (1) { |
1161 | int len = sizeof(*dh) + | 1418 | int len = sizeof(*dh) + |
@@ -1171,7 +1428,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev, | |||
1171 | NULL, CEPH_NOSNAP, | 1428 | NULL, CEPH_NOSNAP, |
1172 | rbd_dev->obj_md_name, | 1429 | rbd_dev->obj_md_name, |
1173 | 0, len, | 1430 | 0, len, |
1174 | (char *)dh); | 1431 | (char *)dh, &ver); |
1175 | if (rc < 0) | 1432 | if (rc < 0) |
1176 | goto out_dh; | 1433 | goto out_dh; |
1177 | 1434 | ||
@@ -1188,6 +1445,7 @@ static int rbd_read_header(struct rbd_device *rbd_dev, | |||
1188 | } | 1445 | } |
1189 | break; | 1446 | break; |
1190 | } | 1447 | } |
1448 | header->obj_version = ver; | ||
1191 | 1449 | ||
1192 | out_dh: | 1450 | out_dh: |
1193 | kfree(dh); | 1451 | kfree(dh); |
@@ -1205,6 +1463,7 @@ static int rbd_header_add_snap(struct rbd_device *dev, | |||
1205 | u64 new_snapid; | 1463 | u64 new_snapid; |
1206 | int ret; | 1464 | int ret; |
1207 | void *data, *data_start, *data_end; | 1465 | void *data, *data_start, *data_end; |
1466 | u64 ver; | ||
1208 | 1467 | ||
1209 | /* we should create a snapshot only if we're pointing at the head */ | 1468 | /* we should create a snapshot only if we're pointing at the head */ |
1210 | if (dev->cur_snap) | 1469 | if (dev->cur_snap) |
@@ -1227,7 +1486,7 @@ static int rbd_header_add_snap(struct rbd_device *dev, | |||
1227 | ceph_encode_64_safe(&data, data_end, new_snapid, bad); | 1486 | ceph_encode_64_safe(&data, data_end, new_snapid, bad); |
1228 | 1487 | ||
1229 | ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", | 1488 | ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", |
1230 | data_start, data - data_start); | 1489 | data_start, data - data_start, &ver); |
1231 | 1490 | ||
1232 | kfree(data_start); | 1491 | kfree(data_start); |
1233 | 1492 | ||
@@ -1259,6 +1518,7 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) | |||
1259 | int ret; | 1518 | int ret; |
1260 | struct rbd_image_header h; | 1519 | struct rbd_image_header h; |
1261 | u64 snap_seq; | 1520 | u64 snap_seq; |
1521 | int follow_seq = 0; | ||
1262 | 1522 | ||
1263 | ret = rbd_read_header(rbd_dev, &h); | 1523 | ret = rbd_read_header(rbd_dev, &h); |
1264 | if (ret < 0) | 1524 | if (ret < 0) |
@@ -1267,6 +1527,11 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) | |||
1267 | down_write(&rbd_dev->header.snap_rwsem); | 1527 | down_write(&rbd_dev->header.snap_rwsem); |
1268 | 1528 | ||
1269 | snap_seq = rbd_dev->header.snapc->seq; | 1529 | snap_seq = rbd_dev->header.snapc->seq; |
1530 | if (rbd_dev->header.total_snaps && | ||
1531 | rbd_dev->header.snapc->snaps[0] == snap_seq) | ||
1532 | /* pointing at the head, will need to follow that | ||
1533 | if head moves */ | ||
1534 | follow_seq = 1; | ||
1270 | 1535 | ||
1271 | kfree(rbd_dev->header.snapc); | 1536 | kfree(rbd_dev->header.snapc); |
1272 | kfree(rbd_dev->header.snap_names); | 1537 | kfree(rbd_dev->header.snap_names); |
@@ -1277,7 +1542,10 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) | |||
1277 | rbd_dev->header.snap_names = h.snap_names; | 1542 | rbd_dev->header.snap_names = h.snap_names; |
1278 | rbd_dev->header.snap_names_len = h.snap_names_len; | 1543 | rbd_dev->header.snap_names_len = h.snap_names_len; |
1279 | rbd_dev->header.snap_sizes = h.snap_sizes; | 1544 | rbd_dev->header.snap_sizes = h.snap_sizes; |
1280 | rbd_dev->header.snapc->seq = snap_seq; | 1545 | if (follow_seq) |
1546 | rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0]; | ||
1547 | else | ||
1548 | rbd_dev->header.snapc->seq = snap_seq; | ||
1281 | 1549 | ||
1282 | ret = __rbd_init_snaps_header(rbd_dev); | 1550 | ret = __rbd_init_snaps_header(rbd_dev); |
1283 | 1551 | ||
@@ -1699,7 +1967,28 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) | |||
1699 | device_unregister(&rbd_dev->dev); | 1967 | device_unregister(&rbd_dev->dev); |
1700 | } | 1968 | } |
1701 | 1969 | ||
1702 | static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) | 1970 | static int rbd_init_watch_dev(struct rbd_device *rbd_dev) |
1971 | { | ||
1972 | int ret, rc; | ||
1973 | |||
1974 | do { | ||
1975 | ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, | ||
1976 | rbd_dev->header.obj_version); | ||
1977 | if (ret == -ERANGE) { | ||
1978 | mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); | ||
1979 | rc = __rbd_update_snaps(rbd_dev); | ||
1980 | mutex_unlock(&ctl_mutex); | ||
1981 | if (rc < 0) | ||
1982 | return rc; | ||
1983 | } | ||
1984 | } while (ret == -ERANGE); | ||
1985 | |||
1986 | return ret; | ||
1987 | } | ||
1988 | |||
1989 | static ssize_t rbd_add(struct bus_type *bus, | ||
1990 | const char *buf, | ||
1991 | size_t count) | ||
1703 | { | 1992 | { |
1704 | struct ceph_osd_client *osdc; | 1993 | struct ceph_osd_client *osdc; |
1705 | struct rbd_device *rbd_dev; | 1994 | struct rbd_device *rbd_dev; |
@@ -1797,6 +2086,10 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) | |||
1797 | if (rc) | 2086 | if (rc) |
1798 | goto err_out_bus; | 2087 | goto err_out_bus; |
1799 | 2088 | ||
2089 | rc = rbd_init_watch_dev(rbd_dev); | ||
2090 | if (rc) | ||
2091 | goto err_out_bus; | ||
2092 | |||
1800 | return count; | 2093 | return count; |
1801 | 2094 | ||
1802 | err_out_bus: | 2095 | err_out_bus: |
@@ -1849,6 +2142,12 @@ static void rbd_dev_release(struct device *dev) | |||
1849 | struct rbd_device *rbd_dev = | 2142 | struct rbd_device *rbd_dev = |
1850 | container_of(dev, struct rbd_device, dev); | 2143 | container_of(dev, struct rbd_device, dev); |
1851 | 2144 | ||
2145 | if (rbd_dev->watch_request) | ||
2146 | ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc, | ||
2147 | rbd_dev->watch_request); | ||
2148 | if (rbd_dev->watch_event) | ||
2149 | ceph_osdc_cancel_event(rbd_dev->watch_event); | ||
2150 | |||
1852 | rbd_put_client(rbd_dev); | 2151 | rbd_put_client(rbd_dev); |
1853 | 2152 | ||
1854 | /* clean up and free blkdev */ | 2153 | /* clean up and free blkdev */ |
@@ -1914,14 +2213,24 @@ static ssize_t rbd_snap_add(struct device *dev, | |||
1914 | ret = rbd_header_add_snap(rbd_dev, | 2213 | ret = rbd_header_add_snap(rbd_dev, |
1915 | name, GFP_KERNEL); | 2214 | name, GFP_KERNEL); |
1916 | if (ret < 0) | 2215 | if (ret < 0) |
1917 | goto done_unlock; | 2216 | goto err_unlock; |
1918 | 2217 | ||
1919 | ret = __rbd_update_snaps(rbd_dev); | 2218 | ret = __rbd_update_snaps(rbd_dev); |
1920 | if (ret < 0) | 2219 | if (ret < 0) |
1921 | goto done_unlock; | 2220 | goto err_unlock; |
2221 | |||
2222 | /* shouldn't hold ctl_mutex when notifying.. notify might | ||
2223 | trigger a watch callback that would need to get that mutex */ | ||
2224 | mutex_unlock(&ctl_mutex); | ||
2225 | |||
2226 | /* make a best effort, don't error if failed */ | ||
2227 | rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); | ||
1922 | 2228 | ||
1923 | ret = count; | 2229 | ret = count; |
1924 | done_unlock: | 2230 | kfree(name); |
2231 | return ret; | ||
2232 | |||
2233 | err_unlock: | ||
1925 | mutex_unlock(&ctl_mutex); | 2234 | mutex_unlock(&ctl_mutex); |
1926 | kfree(name); | 2235 | kfree(name); |
1927 | return ret; | 2236 | return ret; |
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 08f65faac112..0dba6915712b 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | |||
210 | if (!fsc->debugfs_congestion_kb) | 210 | if (!fsc->debugfs_congestion_kb) |
211 | goto out; | 211 | goto out; |
212 | 212 | ||
213 | dout("a\n"); | ||
214 | |||
215 | snprintf(name, sizeof(name), "../../bdi/%s", | 213 | snprintf(name, sizeof(name), "../../bdi/%s", |
216 | dev_name(fsc->backing_dev_info.dev)); | 214 | dev_name(fsc->backing_dev_info.dev)); |
217 | fsc->debugfs_bdi = | 215 | fsc->debugfs_bdi = |
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | |||
221 | if (!fsc->debugfs_bdi) | 219 | if (!fsc->debugfs_bdi) |
222 | goto out; | 220 | goto out; |
223 | 221 | ||
224 | dout("b\n"); | ||
225 | fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", | 222 | fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", |
226 | 0600, | 223 | 0600, |
227 | fsc->client->debugfs_dir, | 224 | fsc->client->debugfs_dir, |
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | |||
230 | if (!fsc->debugfs_mdsmap) | 227 | if (!fsc->debugfs_mdsmap) |
231 | goto out; | 228 | goto out; |
232 | 229 | ||
233 | dout("ca\n"); | ||
234 | fsc->debugfs_mdsc = debugfs_create_file("mdsc", | 230 | fsc->debugfs_mdsc = debugfs_create_file("mdsc", |
235 | 0600, | 231 | 0600, |
236 | fsc->client->debugfs_dir, | 232 | fsc->client->debugfs_dir, |
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | |||
239 | if (!fsc->debugfs_mdsc) | 235 | if (!fsc->debugfs_mdsc) |
240 | goto out; | 236 | goto out; |
241 | 237 | ||
242 | dout("da\n"); | ||
243 | fsc->debugfs_caps = debugfs_create_file("caps", | 238 | fsc->debugfs_caps = debugfs_create_file("caps", |
244 | 0400, | 239 | 0400, |
245 | fsc->client->debugfs_dir, | 240 | fsc->client->debugfs_dir, |
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | |||
248 | if (!fsc->debugfs_caps) | 243 | if (!fsc->debugfs_caps) |
249 | goto out; | 244 | goto out; |
250 | 245 | ||
251 | dout("ea\n"); | ||
252 | fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", | 246 | fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", |
253 | 0600, | 247 | 0600, |
254 | fsc->client->debugfs_dir, | 248 | fsc->client->debugfs_dir, |
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ebafa65a29b6..1a867a3601ae 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
@@ -161,7 +161,7 @@ more: | |||
161 | filp->f_pos = di->offset; | 161 | filp->f_pos = di->offset; |
162 | err = filldir(dirent, dentry->d_name.name, | 162 | err = filldir(dirent, dentry->d_name.name, |
163 | dentry->d_name.len, di->offset, | 163 | dentry->d_name.len, di->offset, |
164 | dentry->d_inode->i_ino, | 164 | ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), |
165 | dentry->d_inode->i_mode >> 12); | 165 | dentry->d_inode->i_mode >> 12); |
166 | 166 | ||
167 | if (last) { | 167 | if (last) { |
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir) | |||
245 | 245 | ||
246 | dout("readdir off 0 -> '.'\n"); | 246 | dout("readdir off 0 -> '.'\n"); |
247 | if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), | 247 | if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), |
248 | inode->i_ino, inode->i_mode >> 12) < 0) | 248 | ceph_translate_ino(inode->i_sb, inode->i_ino), |
249 | inode->i_mode >> 12) < 0) | ||
249 | return 0; | 250 | return 0; |
250 | filp->f_pos = 1; | 251 | filp->f_pos = 1; |
251 | off = 1; | 252 | off = 1; |
252 | } | 253 | } |
253 | if (filp->f_pos == 1) { | 254 | if (filp->f_pos == 1) { |
255 | ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino; | ||
254 | dout("readdir off 1 -> '..'\n"); | 256 | dout("readdir off 1 -> '..'\n"); |
255 | if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), | 257 | if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), |
256 | filp->f_dentry->d_parent->d_inode->i_ino, | 258 | ceph_translate_ino(inode->i_sb, ino), |
257 | inode->i_mode >> 12) < 0) | 259 | inode->i_mode >> 12) < 0) |
258 | return 0; | 260 | return 0; |
259 | filp->f_pos = 2; | 261 | filp->f_pos = 2; |
@@ -377,7 +379,8 @@ more: | |||
377 | if (filldir(dirent, | 379 | if (filldir(dirent, |
378 | rinfo->dir_dname[off - fi->offset], | 380 | rinfo->dir_dname[off - fi->offset], |
379 | rinfo->dir_dname_len[off - fi->offset], | 381 | rinfo->dir_dname_len[off - fi->offset], |
380 | pos, ino, ftype) < 0) { | 382 | pos, |
383 | ceph_translate_ino(inode->i_sb, ino), ftype) < 0) { | ||
381 | dout("filldir stopping us...\n"); | 384 | dout("filldir stopping us...\n"); |
382 | return 0; | 385 | return 0; |
383 | } | 386 | } |
@@ -1024,14 +1027,13 @@ out_touch: | |||
1024 | } | 1027 | } |
1025 | 1028 | ||
1026 | /* | 1029 | /* |
1027 | * When a dentry is released, clear the dir I_COMPLETE if it was part | 1030 | * Release our ceph_dentry_info. |
1028 | * of the current dir gen or if this is in the snapshot namespace. | ||
1029 | */ | 1031 | */ |
1030 | static void ceph_dentry_release(struct dentry *dentry) | 1032 | static void ceph_d_release(struct dentry *dentry) |
1031 | { | 1033 | { |
1032 | struct ceph_dentry_info *di = ceph_dentry(dentry); | 1034 | struct ceph_dentry_info *di = ceph_dentry(dentry); |
1033 | 1035 | ||
1034 | dout("dentry_release %p\n", dentry); | 1036 | dout("d_release %p\n", dentry); |
1035 | if (di) { | 1037 | if (di) { |
1036 | ceph_dentry_lru_del(dentry); | 1038 | ceph_dentry_lru_del(dentry); |
1037 | if (di->lease_session) | 1039 | if (di->lease_session) |
@@ -1256,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = { | |||
1256 | 1258 | ||
1257 | const struct dentry_operations ceph_dentry_ops = { | 1259 | const struct dentry_operations ceph_dentry_ops = { |
1258 | .d_revalidate = ceph_d_revalidate, | 1260 | .d_revalidate = ceph_d_revalidate, |
1259 | .d_release = ceph_dentry_release, | 1261 | .d_release = ceph_d_release, |
1260 | }; | 1262 | }; |
1261 | 1263 | ||
1262 | const struct dentry_operations ceph_snapdir_dentry_ops = { | 1264 | const struct dentry_operations ceph_snapdir_dentry_ops = { |
1263 | .d_revalidate = ceph_snapdir_d_revalidate, | 1265 | .d_revalidate = ceph_snapdir_d_revalidate, |
1264 | .d_release = ceph_dentry_release, | 1266 | .d_release = ceph_d_release, |
1265 | }; | 1267 | }; |
1266 | 1268 | ||
1267 | const struct dentry_operations ceph_snap_dentry_ops = { | 1269 | const struct dentry_operations ceph_snap_dentry_ops = { |
1268 | .d_release = ceph_dentry_release, | 1270 | .d_release = ceph_d_release, |
1269 | }; | 1271 | }; |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7d0e4a82d898..159b512d5a27 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -564,11 +564,19 @@ more: | |||
564 | * start_request so that a tid has been assigned. | 564 | * start_request so that a tid has been assigned. |
565 | */ | 565 | */ |
566 | spin_lock(&ci->i_unsafe_lock); | 566 | spin_lock(&ci->i_unsafe_lock); |
567 | list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); | 567 | list_add_tail(&req->r_unsafe_item, |
568 | &ci->i_unsafe_writes); | ||
568 | spin_unlock(&ci->i_unsafe_lock); | 569 | spin_unlock(&ci->i_unsafe_lock); |
569 | ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); | 570 | ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); |
570 | } | 571 | } |
572 | |||
571 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); | 573 | ret = ceph_osdc_wait_request(&fsc->client->osdc, req); |
574 | if (ret < 0 && req->r_safe_callback) { | ||
575 | spin_lock(&ci->i_unsafe_lock); | ||
576 | list_del_init(&req->r_unsafe_item); | ||
577 | spin_unlock(&ci->i_unsafe_lock); | ||
578 | ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); | ||
579 | } | ||
572 | } | 580 | } |
573 | 581 | ||
574 | if (file->f_flags & O_DIRECT) | 582 | if (file->f_flags & O_DIRECT) |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 193bfa5e9cbd..b54c97da1c43 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work); | |||
36 | /* | 36 | /* |
37 | * find or create an inode, given the ceph ino number | 37 | * find or create an inode, given the ceph ino number |
38 | */ | 38 | */ |
39 | static int ceph_set_ino_cb(struct inode *inode, void *data) | ||
40 | { | ||
41 | ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; | ||
42 | inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); | ||
43 | return 0; | ||
44 | } | ||
45 | |||
39 | struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) | 46 | struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) |
40 | { | 47 | { |
41 | struct inode *inode; | 48 | struct inode *inode; |
@@ -1030,9 +1037,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, | |||
1030 | dout("fill_trace doing d_move %p -> %p\n", | 1037 | dout("fill_trace doing d_move %p -> %p\n", |
1031 | req->r_old_dentry, dn); | 1038 | req->r_old_dentry, dn); |
1032 | 1039 | ||
1033 | /* d_move screws up d_subdirs order */ | ||
1034 | ceph_i_clear(dir, CEPH_I_COMPLETE); | ||
1035 | |||
1036 | d_move(req->r_old_dentry, dn); | 1040 | d_move(req->r_old_dentry, dn); |
1037 | dout(" src %p '%.*s' dst %p '%.*s'\n", | 1041 | dout(" src %p '%.*s' dst %p '%.*s'\n", |
1038 | req->r_old_dentry, | 1042 | req->r_old_dentry, |
@@ -1044,12 +1048,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, | |||
1044 | rehashing bug in vfs_rename_dir */ | 1048 | rehashing bug in vfs_rename_dir */ |
1045 | ceph_invalidate_dentry_lease(dn); | 1049 | ceph_invalidate_dentry_lease(dn); |
1046 | 1050 | ||
1047 | /* take overwritten dentry's readdir offset */ | 1051 | /* |
1048 | dout("dn %p gets %p offset %lld (old offset %lld)\n", | 1052 | * d_move() puts the renamed dentry at the end of |
1049 | req->r_old_dentry, dn, ceph_dentry(dn)->offset, | 1053 | * d_subdirs. We need to assign it an appropriate |
1054 | * directory offset so we can behave when holding | ||
1055 | * I_COMPLETE. | ||
1056 | */ | ||
1057 | ceph_set_dentry_offset(req->r_old_dentry); | ||
1058 | dout("dn %p gets new offset %lld\n", req->r_old_dentry, | ||
1050 | ceph_dentry(req->r_old_dentry)->offset); | 1059 | ceph_dentry(req->r_old_dentry)->offset); |
1051 | ceph_dentry(req->r_old_dentry)->offset = | ||
1052 | ceph_dentry(dn)->offset; | ||
1053 | 1060 | ||
1054 | dn = req->r_old_dentry; /* use old_dentry */ | 1061 | dn = req->r_old_dentry; /* use old_dentry */ |
1055 | in = dn->d_inode; | 1062 | in = dn->d_inode; |
@@ -1809,7 +1816,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
1809 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); | 1816 | err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); |
1810 | if (!err) { | 1817 | if (!err) { |
1811 | generic_fillattr(inode, stat); | 1818 | generic_fillattr(inode, stat); |
1812 | stat->ino = inode->i_ino; | 1819 | stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); |
1813 | if (ceph_snap(inode) != CEPH_NOSNAP) | 1820 | if (ceph_snap(inode) != CEPH_NOSNAP) |
1814 | stat->dev = ceph_snap(inode); | 1821 | stat->dev = ceph_snap(inode); |
1815 | else | 1822 | else |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9c5085465a63..a9e78b4a258c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -131,6 +131,7 @@ enum { | |||
131 | Opt_rbytes, | 131 | Opt_rbytes, |
132 | Opt_norbytes, | 132 | Opt_norbytes, |
133 | Opt_noasyncreaddir, | 133 | Opt_noasyncreaddir, |
134 | Opt_ino32, | ||
134 | }; | 135 | }; |
135 | 136 | ||
136 | static match_table_t fsopt_tokens = { | 137 | static match_table_t fsopt_tokens = { |
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = { | |||
150 | {Opt_rbytes, "rbytes"}, | 151 | {Opt_rbytes, "rbytes"}, |
151 | {Opt_norbytes, "norbytes"}, | 152 | {Opt_norbytes, "norbytes"}, |
152 | {Opt_noasyncreaddir, "noasyncreaddir"}, | 153 | {Opt_noasyncreaddir, "noasyncreaddir"}, |
154 | {Opt_ino32, "ino32"}, | ||
153 | {-1, NULL} | 155 | {-1, NULL} |
154 | }; | 156 | }; |
155 | 157 | ||
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private) | |||
225 | case Opt_noasyncreaddir: | 227 | case Opt_noasyncreaddir: |
226 | fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; | 228 | fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; |
227 | break; | 229 | break; |
230 | case Opt_ino32: | ||
231 | fsopt->flags |= CEPH_MOUNT_OPT_INO32; | ||
232 | break; | ||
228 | default: | 233 | default: |
229 | BUG_ON(token); | 234 | BUG_ON(token); |
230 | } | 235 | } |
@@ -288,7 +293,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, | |||
288 | fsopt->sb_flags = flags; | 293 | fsopt->sb_flags = flags; |
289 | fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; | 294 | fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; |
290 | 295 | ||
291 | fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; | 296 | fsopt->rsize = CEPH_RSIZE_DEFAULT; |
292 | fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); | 297 | fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); |
293 | fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; | 298 | fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; |
294 | fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; | 299 | fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; |
@@ -370,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt) | |||
370 | 375 | ||
371 | if (fsopt->wsize) | 376 | if (fsopt->wsize) |
372 | seq_printf(m, ",wsize=%d", fsopt->wsize); | 377 | seq_printf(m, ",wsize=%d", fsopt->wsize); |
373 | if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) | 378 | if (fsopt->rsize != CEPH_RSIZE_DEFAULT) |
374 | seq_printf(m, ",rsize=%d", fsopt->rsize); | 379 | seq_printf(m, ",rsize=%d", fsopt->rsize); |
375 | if (fsopt->congestion_kb != default_congestion_kb()) | 380 | if (fsopt->congestion_kb != default_congestion_kb()) |
376 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); | 381 | seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 20b907d76ae2..619fe719968f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -27,6 +27,7 @@ | |||
27 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ | 27 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ |
28 | #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ | 28 | #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ |
29 | #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ | 29 | #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ |
30 | #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ | ||
30 | 31 | ||
31 | #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) | 32 | #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) |
32 | 33 | ||
@@ -35,6 +36,7 @@ | |||
35 | #define ceph_test_mount_opt(fsc, opt) \ | 36 | #define ceph_test_mount_opt(fsc, opt) \ |
36 | (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) | 37 | (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) |
37 | 38 | ||
39 | #define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ | ||
38 | #define CEPH_MAX_READDIR_DEFAULT 1024 | 40 | #define CEPH_MAX_READDIR_DEFAULT 1024 |
39 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) | 41 | #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) |
40 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" | 42 | #define CEPH_SNAPDIRNAME_DEFAULT ".snap" |
@@ -319,6 +321,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode) | |||
319 | return container_of(inode, struct ceph_inode_info, vfs_inode); | 321 | return container_of(inode, struct ceph_inode_info, vfs_inode); |
320 | } | 322 | } |
321 | 323 | ||
324 | static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) | ||
325 | { | ||
326 | return (struct ceph_fs_client *)inode->i_sb->s_fs_info; | ||
327 | } | ||
328 | |||
329 | static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) | ||
330 | { | ||
331 | return (struct ceph_fs_client *)sb->s_fs_info; | ||
332 | } | ||
333 | |||
322 | static inline struct ceph_vino ceph_vino(struct inode *inode) | 334 | static inline struct ceph_vino ceph_vino(struct inode *inode) |
323 | { | 335 | { |
324 | return ceph_inode(inode)->i_vino; | 336 | return ceph_inode(inode)->i_vino; |
@@ -327,19 +339,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode) | |||
327 | /* | 339 | /* |
328 | * ino_t is <64 bits on many architectures, blech. | 340 | * ino_t is <64 bits on many architectures, blech. |
329 | * | 341 | * |
330 | * don't include snap in ino hash, at least for now. | 342 | * i_ino (kernel inode) st_ino (userspace) |
343 | * i386 32 32 | ||
344 | * x86_64+ino32 64 32 | ||
345 | * x86_64 64 64 | ||
346 | */ | ||
347 | static inline u32 ceph_ino_to_ino32(ino_t ino) | ||
348 | { | ||
349 | ino ^= ino >> (sizeof(ino) * 8 - 32); | ||
350 | if (!ino) | ||
351 | ino = 1; | ||
352 | return ino; | ||
353 | } | ||
354 | |||
355 | /* | ||
356 | * kernel i_ino value | ||
331 | */ | 357 | */ |
332 | static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) | 358 | static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) |
333 | { | 359 | { |
334 | ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ | 360 | ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ |
335 | #if BITS_PER_LONG == 32 | 361 | #if BITS_PER_LONG == 32 |
336 | ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; | 362 | ino = ceph_ino_to_ino32(ino); |
337 | if (!ino) | ||
338 | ino = 1; | ||
339 | #endif | 363 | #endif |
340 | return ino; | 364 | return ino; |
341 | } | 365 | } |
342 | 366 | ||
367 | /* | ||
368 | * user-visible ino (stat, filldir) | ||
369 | */ | ||
370 | #if BITS_PER_LONG == 32 | ||
371 | static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) | ||
372 | { | ||
373 | return ino; | ||
374 | } | ||
375 | #else | ||
376 | static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino) | ||
377 | { | ||
378 | if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)) | ||
379 | ino = ceph_ino_to_ino32(ino); | ||
380 | return ino; | ||
381 | } | ||
382 | #endif | ||
383 | |||
384 | |||
343 | /* for printf-style formatting */ | 385 | /* for printf-style formatting */ |
344 | #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap | 386 | #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap |
345 | 387 | ||
@@ -428,13 +470,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) | |||
428 | return ((loff_t)frag << 32) | (loff_t)off; | 470 | return ((loff_t)frag << 32) | (loff_t)off; |
429 | } | 471 | } |
430 | 472 | ||
431 | static inline int ceph_set_ino_cb(struct inode *inode, void *data) | ||
432 | { | ||
433 | ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; | ||
434 | inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data); | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | /* | 473 | /* |
439 | * caps helpers | 474 | * caps helpers |
440 | */ | 475 | */ |
@@ -503,15 +538,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client, | |||
503 | int *total, int *avail, int *used, | 538 | int *total, int *avail, int *used, |
504 | int *reserved, int *min); | 539 | int *reserved, int *min); |
505 | 540 | ||
506 | static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) | ||
507 | { | ||
508 | return (struct ceph_fs_client *)inode->i_sb->s_fs_info; | ||
509 | } | ||
510 | |||
511 | static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) | ||
512 | { | ||
513 | return (struct ceph_fs_client *)sb->s_fs_info; | ||
514 | } | ||
515 | 541 | ||
516 | 542 | ||
517 | /* | 543 | /* |
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 09dcc0c2ffd5..b8e995fbd867 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h | |||
@@ -136,9 +136,18 @@ struct ceph_dir_layout { | |||
136 | 136 | ||
137 | 137 | ||
138 | /* osd */ | 138 | /* osd */ |
139 | #define CEPH_MSG_OSD_MAP 41 | 139 | #define CEPH_MSG_OSD_MAP 41 |
140 | #define CEPH_MSG_OSD_OP 42 | 140 | #define CEPH_MSG_OSD_OP 42 |
141 | #define CEPH_MSG_OSD_OPREPLY 43 | 141 | #define CEPH_MSG_OSD_OPREPLY 43 |
142 | #define CEPH_MSG_WATCH_NOTIFY 44 | ||
143 | |||
144 | |||
145 | /* watch-notify operations */ | ||
146 | enum { | ||
147 | WATCH_NOTIFY = 1, /* notifying watcher */ | ||
148 | WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ | ||
149 | }; | ||
150 | |||
142 | 151 | ||
143 | /* pool operations */ | 152 | /* pool operations */ |
144 | enum { | 153 | enum { |
@@ -213,8 +222,10 @@ struct ceph_client_mount { | |||
213 | struct ceph_mon_request_header monhdr; | 222 | struct ceph_mon_request_header monhdr; |
214 | } __attribute__ ((packed)); | 223 | } __attribute__ ((packed)); |
215 | 224 | ||
225 | #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ | ||
226 | |||
216 | struct ceph_mon_subscribe_item { | 227 | struct ceph_mon_subscribe_item { |
217 | __le64 have_version; __le64 have; | 228 | __le64 have_version; __le64 have; |
218 | __u8 onetime; | 229 | __u8 onetime; |
219 | } __attribute__ ((packed)); | 230 | } __attribute__ ((packed)); |
220 | 231 | ||
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 72c72bfccb88..0d2e0fffb470 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h | |||
@@ -71,7 +71,6 @@ struct ceph_options { | |||
71 | #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ | 71 | #define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */ |
72 | #define CEPH_OSD_KEEPALIVE_DEFAULT 5 | 72 | #define CEPH_OSD_KEEPALIVE_DEFAULT 5 |
73 | #define CEPH_OSD_IDLE_TTL_DEFAULT 60 | 73 | #define CEPH_OSD_IDLE_TTL_DEFAULT 60 |
74 | #define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ | ||
75 | 74 | ||
76 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) | 75 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) |
77 | #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) | 76 | #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) |
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a1af29648fb5..f88eacb111d4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -32,6 +32,7 @@ struct ceph_osd { | |||
32 | struct rb_node o_node; | 32 | struct rb_node o_node; |
33 | struct ceph_connection o_con; | 33 | struct ceph_connection o_con; |
34 | struct list_head o_requests; | 34 | struct list_head o_requests; |
35 | struct list_head o_linger_requests; | ||
35 | struct list_head o_osd_lru; | 36 | struct list_head o_osd_lru; |
36 | struct ceph_authorizer *o_authorizer; | 37 | struct ceph_authorizer *o_authorizer; |
37 | void *o_authorizer_buf, *o_authorizer_reply_buf; | 38 | void *o_authorizer_buf, *o_authorizer_reply_buf; |
@@ -47,6 +48,8 @@ struct ceph_osd_request { | |||
47 | struct rb_node r_node; | 48 | struct rb_node r_node; |
48 | struct list_head r_req_lru_item; | 49 | struct list_head r_req_lru_item; |
49 | struct list_head r_osd_item; | 50 | struct list_head r_osd_item; |
51 | struct list_head r_linger_item; | ||
52 | struct list_head r_linger_osd; | ||
50 | struct ceph_osd *r_osd; | 53 | struct ceph_osd *r_osd; |
51 | struct ceph_pg r_pgid; | 54 | struct ceph_pg r_pgid; |
52 | int r_pg_osds[CEPH_PG_MAX_SIZE]; | 55 | int r_pg_osds[CEPH_PG_MAX_SIZE]; |
@@ -59,6 +62,7 @@ struct ceph_osd_request { | |||
59 | int r_flags; /* any additional flags for the osd */ | 62 | int r_flags; /* any additional flags for the osd */ |
60 | u32 r_sent; /* >0 if r_request is sending/sent */ | 63 | u32 r_sent; /* >0 if r_request is sending/sent */ |
61 | int r_got_reply; | 64 | int r_got_reply; |
65 | int r_linger; | ||
62 | 66 | ||
63 | struct ceph_osd_client *r_osdc; | 67 | struct ceph_osd_client *r_osdc; |
64 | struct kref r_kref; | 68 | struct kref r_kref; |
@@ -74,7 +78,6 @@ struct ceph_osd_request { | |||
74 | char r_oid[40]; /* object name */ | 78 | char r_oid[40]; /* object name */ |
75 | int r_oid_len; | 79 | int r_oid_len; |
76 | unsigned long r_stamp; /* send OR check time */ | 80 | unsigned long r_stamp; /* send OR check time */ |
77 | bool r_resend; /* msg send failed, needs retry */ | ||
78 | 81 | ||
79 | struct ceph_file_layout r_file_layout; | 82 | struct ceph_file_layout r_file_layout; |
80 | struct ceph_snap_context *r_snapc; /* snap context for writes */ | 83 | struct ceph_snap_context *r_snapc; /* snap context for writes */ |
@@ -90,6 +93,26 @@ struct ceph_osd_request { | |||
90 | struct ceph_pagelist *r_trail; /* trailing part of the data */ | 93 | struct ceph_pagelist *r_trail; /* trailing part of the data */ |
91 | }; | 94 | }; |
92 | 95 | ||
96 | struct ceph_osd_event { | ||
97 | u64 cookie; | ||
98 | int one_shot; | ||
99 | struct ceph_osd_client *osdc; | ||
100 | void (*cb)(u64, u64, u8, void *); | ||
101 | void *data; | ||
102 | struct rb_node node; | ||
103 | struct list_head osd_node; | ||
104 | struct kref kref; | ||
105 | struct completion completion; | ||
106 | }; | ||
107 | |||
108 | struct ceph_osd_event_work { | ||
109 | struct work_struct work; | ||
110 | struct ceph_osd_event *event; | ||
111 | u64 ver; | ||
112 | u64 notify_id; | ||
113 | u8 opcode; | ||
114 | }; | ||
115 | |||
93 | struct ceph_osd_client { | 116 | struct ceph_osd_client { |
94 | struct ceph_client *client; | 117 | struct ceph_client *client; |
95 | 118 | ||
@@ -104,7 +127,10 @@ struct ceph_osd_client { | |||
104 | u64 timeout_tid; /* tid of timeout triggering rq */ | 127 | u64 timeout_tid; /* tid of timeout triggering rq */ |
105 | u64 last_tid; /* tid of last request */ | 128 | u64 last_tid; /* tid of last request */ |
106 | struct rb_root requests; /* pending requests */ | 129 | struct rb_root requests; /* pending requests */ |
107 | struct list_head req_lru; /* pending requests lru */ | 130 | struct list_head req_lru; /* in-flight lru */ |
131 | struct list_head req_unsent; /* unsent/need-resend queue */ | ||
132 | struct list_head req_notarget; /* map to no osd */ | ||
133 | struct list_head req_linger; /* lingering requests */ | ||
108 | int num_requests; | 134 | int num_requests; |
109 | struct delayed_work timeout_work; | 135 | struct delayed_work timeout_work; |
110 | struct delayed_work osds_timeout_work; | 136 | struct delayed_work osds_timeout_work; |
@@ -116,6 +142,12 @@ struct ceph_osd_client { | |||
116 | 142 | ||
117 | struct ceph_msgpool msgpool_op; | 143 | struct ceph_msgpool msgpool_op; |
118 | struct ceph_msgpool msgpool_op_reply; | 144 | struct ceph_msgpool msgpool_op_reply; |
145 | |||
146 | spinlock_t event_lock; | ||
147 | struct rb_root event_tree; | ||
148 | u64 event_count; | ||
149 | |||
150 | struct workqueue_struct *notify_wq; | ||
119 | }; | 151 | }; |
120 | 152 | ||
121 | struct ceph_osd_req_op { | 153 | struct ceph_osd_req_op { |
@@ -150,6 +182,13 @@ struct ceph_osd_req_op { | |||
150 | struct { | 182 | struct { |
151 | u64 snapid; | 183 | u64 snapid; |
152 | } snap; | 184 | } snap; |
185 | struct { | ||
186 | u64 cookie; | ||
187 | u64 ver; | ||
188 | __u8 flag; | ||
189 | u32 prot_ver; | ||
190 | u32 timeout; | ||
191 | } watch; | ||
153 | }; | 192 | }; |
154 | u32 payload_len; | 193 | u32 payload_len; |
155 | }; | 194 | }; |
@@ -198,6 +237,11 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | |||
198 | bool use_mempool, int num_reply, | 237 | bool use_mempool, int num_reply, |
199 | int page_align); | 238 | int page_align); |
200 | 239 | ||
240 | extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | ||
241 | struct ceph_osd_request *req); | ||
242 | extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, | ||
243 | struct ceph_osd_request *req); | ||
244 | |||
201 | static inline void ceph_osdc_get_request(struct ceph_osd_request *req) | 245 | static inline void ceph_osdc_get_request(struct ceph_osd_request *req) |
202 | { | 246 | { |
203 | kref_get(&req->r_kref); | 247 | kref_get(&req->r_kref); |
@@ -233,5 +277,14 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | |||
233 | struct page **pages, int nr_pages, | 277 | struct page **pages, int nr_pages, |
234 | int flags, int do_sync, bool nofail); | 278 | int flags, int do_sync, bool nofail); |
235 | 279 | ||
280 | /* watch/notify events */ | ||
281 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, | ||
282 | void (*event_cb)(u64, u64, u8, void *), | ||
283 | int one_shot, void *data, | ||
284 | struct ceph_osd_event **pevent); | ||
285 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); | ||
286 | extern int ceph_osdc_wait_event(struct ceph_osd_event *event, | ||
287 | unsigned long timeout); | ||
288 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); | ||
236 | #endif | 289 | #endif |
237 | 290 | ||
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 6d5247f2e81b..0a99099801a4 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
@@ -12,9 +12,9 @@ | |||
12 | * osdmap encoding versions | 12 | * osdmap encoding versions |
13 | */ | 13 | */ |
14 | #define CEPH_OSDMAP_INC_VERSION 5 | 14 | #define CEPH_OSDMAP_INC_VERSION 5 |
15 | #define CEPH_OSDMAP_INC_VERSION_EXT 5 | 15 | #define CEPH_OSDMAP_INC_VERSION_EXT 6 |
16 | #define CEPH_OSDMAP_VERSION 5 | 16 | #define CEPH_OSDMAP_VERSION 5 |
17 | #define CEPH_OSDMAP_VERSION_EXT 5 | 17 | #define CEPH_OSDMAP_VERSION_EXT 6 |
18 | 18 | ||
19 | /* | 19 | /* |
20 | * fs id | 20 | * fs id |
@@ -181,9 +181,17 @@ enum { | |||
181 | /* read */ | 181 | /* read */ |
182 | CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, | 182 | CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1, |
183 | CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, | 183 | CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2, |
184 | CEPH_OSD_OP_MAPEXT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 3, | ||
184 | 185 | ||
185 | /* fancy read */ | 186 | /* fancy read */ |
186 | CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, | 187 | CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4, |
188 | CEPH_OSD_OP_SPARSE_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 5, | ||
189 | |||
190 | CEPH_OSD_OP_NOTIFY = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 6, | ||
191 | CEPH_OSD_OP_NOTIFY_ACK = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 7, | ||
192 | |||
193 | /* versioning */ | ||
194 | CEPH_OSD_OP_ASSERT_VER = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 8, | ||
187 | 195 | ||
188 | /* write */ | 196 | /* write */ |
189 | CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, | 197 | CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1, |
@@ -205,6 +213,8 @@ enum { | |||
205 | CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, | 213 | CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, |
206 | CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, | 214 | CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14, |
207 | 215 | ||
216 | CEPH_OSD_OP_WATCH = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 15, | ||
217 | |||
208 | /** attrs **/ | 218 | /** attrs **/ |
209 | /* read */ | 219 | /* read */ |
210 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, | 220 | CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, |
@@ -218,11 +228,14 @@ enum { | |||
218 | CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, | 228 | CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4, |
219 | 229 | ||
220 | /** subop **/ | 230 | /** subop **/ |
221 | CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, | 231 | CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1, |
222 | CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, | 232 | CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2, |
223 | CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, | 233 | CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3, |
224 | CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, | 234 | CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4, |
225 | CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, | 235 | CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5, |
236 | CEPH_OSD_OP_SCRUB_RESERVE = CEPH_OSD_OP_MODE_SUB | 6, | ||
237 | CEPH_OSD_OP_SCRUB_UNRESERVE = CEPH_OSD_OP_MODE_SUB | 7, | ||
238 | CEPH_OSD_OP_SCRUB_STOP = CEPH_OSD_OP_MODE_SUB | 8, | ||
226 | 239 | ||
227 | /** lock **/ | 240 | /** lock **/ |
228 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, | 241 | CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1, |
@@ -328,6 +341,8 @@ enum { | |||
328 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 | 341 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 |
329 | }; | 342 | }; |
330 | 343 | ||
344 | #define RADOS_NOTIFY_VER 1 | ||
345 | |||
331 | /* | 346 | /* |
332 | * an individual object operation. each may be accompanied by some data | 347 | * an individual object operation. each may be accompanied by some data |
333 | * payload | 348 | * payload |
@@ -359,7 +374,12 @@ struct ceph_osd_op { | |||
359 | struct { | 374 | struct { |
360 | __le64 snapid; | 375 | __le64 snapid; |
361 | } __attribute__ ((packed)) snap; | 376 | } __attribute__ ((packed)) snap; |
362 | }; | 377 | struct { |
378 | __le64 cookie; | ||
379 | __le64 ver; | ||
380 | __u8 flag; /* 0 = unwatch, 1 = watch */ | ||
381 | } __attribute__ ((packed)) watch; | ||
382 | }; | ||
363 | __le32 payload_len; | 383 | __le32 payload_len; |
364 | } __attribute__ ((packed)); | 384 | } __attribute__ ((packed)); |
365 | 385 | ||
@@ -402,4 +422,5 @@ struct ceph_osd_reply_head { | |||
402 | } __attribute__ ((packed)); | 422 | } __attribute__ ((packed)); |
403 | 423 | ||
404 | 424 | ||
425 | |||
405 | #endif | 426 | #endif |
diff --git a/net/ceph/armor.c b/net/ceph/armor.c index eb2a666b0be7..1fc1ee11dfa2 100644 --- a/net/ceph/armor.c +++ b/net/ceph/armor.c | |||
@@ -78,8 +78,10 @@ int ceph_unarmor(char *dst, const char *src, const char *end) | |||
78 | while (src < end) { | 78 | while (src < end) { |
79 | int a, b, c, d; | 79 | int a, b, c, d; |
80 | 80 | ||
81 | if (src < end && src[0] == '\n') | 81 | if (src[0] == '\n') { |
82 | src++; | 82 | src++; |
83 | continue; | ||
84 | } | ||
83 | if (src + 4 > end) | 85 | if (src + 4 > end) |
84 | return -EINVAL; | 86 | return -EINVAL; |
85 | a = decode_bits(src[0]); | 87 | a = decode_bits(src[0]); |
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f3e4a13fea0c..95f96ab94bba 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c | |||
@@ -62,6 +62,7 @@ const char *ceph_msg_type_name(int type) | |||
62 | case CEPH_MSG_OSD_MAP: return "osd_map"; | 62 | case CEPH_MSG_OSD_MAP: return "osd_map"; |
63 | case CEPH_MSG_OSD_OP: return "osd_op"; | 63 | case CEPH_MSG_OSD_OP: return "osd_op"; |
64 | case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; | 64 | case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; |
65 | case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; | ||
65 | default: return "unknown"; | 66 | default: return "unknown"; |
66 | } | 67 | } |
67 | } | 68 | } |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3e20a122ffa2..02212ed50852 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -22,10 +22,15 @@ | |||
22 | #define OSD_OPREPLY_FRONT_LEN 512 | 22 | #define OSD_OPREPLY_FRONT_LEN 512 |
23 | 23 | ||
24 | static const struct ceph_connection_operations osd_con_ops; | 24 | static const struct ceph_connection_operations osd_con_ops; |
25 | static int __kick_requests(struct ceph_osd_client *osdc, | ||
26 | struct ceph_osd *kickosd); | ||
27 | 25 | ||
28 | static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); | 26 | static void send_queued(struct ceph_osd_client *osdc); |
27 | static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); | ||
28 | static void __register_request(struct ceph_osd_client *osdc, | ||
29 | struct ceph_osd_request *req); | ||
30 | static void __unregister_linger_request(struct ceph_osd_client *osdc, | ||
31 | struct ceph_osd_request *req); | ||
32 | static int __send_request(struct ceph_osd_client *osdc, | ||
33 | struct ceph_osd_request *req); | ||
29 | 34 | ||
30 | static int op_needs_trail(int op) | 35 | static int op_needs_trail(int op) |
31 | { | 36 | { |
@@ -34,6 +39,7 @@ static int op_needs_trail(int op) | |||
34 | case CEPH_OSD_OP_SETXATTR: | 39 | case CEPH_OSD_OP_SETXATTR: |
35 | case CEPH_OSD_OP_CMPXATTR: | 40 | case CEPH_OSD_OP_CMPXATTR: |
36 | case CEPH_OSD_OP_CALL: | 41 | case CEPH_OSD_OP_CALL: |
42 | case CEPH_OSD_OP_NOTIFY: | ||
37 | return 1; | 43 | return 1; |
38 | default: | 44 | default: |
39 | return 0; | 45 | return 0; |
@@ -209,6 +215,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
209 | init_completion(&req->r_completion); | 215 | init_completion(&req->r_completion); |
210 | init_completion(&req->r_safe_completion); | 216 | init_completion(&req->r_safe_completion); |
211 | INIT_LIST_HEAD(&req->r_unsafe_item); | 217 | INIT_LIST_HEAD(&req->r_unsafe_item); |
218 | INIT_LIST_HEAD(&req->r_linger_item); | ||
219 | INIT_LIST_HEAD(&req->r_linger_osd); | ||
212 | req->r_flags = flags; | 220 | req->r_flags = flags; |
213 | 221 | ||
214 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); | 222 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); |
@@ -315,6 +323,24 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
315 | break; | 323 | break; |
316 | case CEPH_OSD_OP_STARTSYNC: | 324 | case CEPH_OSD_OP_STARTSYNC: |
317 | break; | 325 | break; |
326 | case CEPH_OSD_OP_NOTIFY: | ||
327 | { | ||
328 | __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); | ||
329 | __le32 timeout = cpu_to_le32(src->watch.timeout); | ||
330 | |||
331 | BUG_ON(!req->r_trail); | ||
332 | |||
333 | ceph_pagelist_append(req->r_trail, | ||
334 | &prot_ver, sizeof(prot_ver)); | ||
335 | ceph_pagelist_append(req->r_trail, | ||
336 | &timeout, sizeof(timeout)); | ||
337 | } | ||
338 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
339 | case CEPH_OSD_OP_WATCH: | ||
340 | dst->watch.cookie = cpu_to_le64(src->watch.cookie); | ||
341 | dst->watch.ver = cpu_to_le64(src->watch.ver); | ||
342 | dst->watch.flag = src->watch.flag; | ||
343 | break; | ||
318 | default: | 344 | default: |
319 | pr_err("unrecognized osd opcode %d\n", dst->op); | 345 | pr_err("unrecognized osd opcode %d\n", dst->op); |
320 | WARN_ON(1); | 346 | WARN_ON(1); |
@@ -529,6 +555,45 @@ __lookup_request_ge(struct ceph_osd_client *osdc, | |||
529 | return NULL; | 555 | return NULL; |
530 | } | 556 | } |
531 | 557 | ||
558 | /* | ||
559 | * Resubmit requests pending on the given osd. | ||
560 | */ | ||
561 | static void __kick_osd_requests(struct ceph_osd_client *osdc, | ||
562 | struct ceph_osd *osd) | ||
563 | { | ||
564 | struct ceph_osd_request *req, *nreq; | ||
565 | int err; | ||
566 | |||
567 | dout("__kick_osd_requests osd%d\n", osd->o_osd); | ||
568 | err = __reset_osd(osdc, osd); | ||
569 | if (err == -EAGAIN) | ||
570 | return; | ||
571 | |||
572 | list_for_each_entry(req, &osd->o_requests, r_osd_item) { | ||
573 | list_move(&req->r_req_lru_item, &osdc->req_unsent); | ||
574 | dout("requeued %p tid %llu osd%d\n", req, req->r_tid, | ||
575 | osd->o_osd); | ||
576 | if (!req->r_linger) | ||
577 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | ||
578 | } | ||
579 | |||
580 | list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, | ||
581 | r_linger_osd) { | ||
582 | __unregister_linger_request(osdc, req); | ||
583 | __register_request(osdc, req); | ||
584 | list_move(&req->r_req_lru_item, &osdc->req_unsent); | ||
585 | dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, | ||
586 | osd->o_osd); | ||
587 | } | ||
588 | } | ||
589 | |||
590 | static void kick_osd_requests(struct ceph_osd_client *osdc, | ||
591 | struct ceph_osd *kickosd) | ||
592 | { | ||
593 | mutex_lock(&osdc->request_mutex); | ||
594 | __kick_osd_requests(osdc, kickosd); | ||
595 | mutex_unlock(&osdc->request_mutex); | ||
596 | } | ||
532 | 597 | ||
533 | /* | 598 | /* |
534 | * If the osd connection drops, we need to resubmit all requests. | 599 | * If the osd connection drops, we need to resubmit all requests. |
@@ -543,7 +608,8 @@ static void osd_reset(struct ceph_connection *con) | |||
543 | dout("osd_reset osd%d\n", osd->o_osd); | 608 | dout("osd_reset osd%d\n", osd->o_osd); |
544 | osdc = osd->o_osdc; | 609 | osdc = osd->o_osdc; |
545 | down_read(&osdc->map_sem); | 610 | down_read(&osdc->map_sem); |
546 | kick_requests(osdc, osd); | 611 | kick_osd_requests(osdc, osd); |
612 | send_queued(osdc); | ||
547 | up_read(&osdc->map_sem); | 613 | up_read(&osdc->map_sem); |
548 | } | 614 | } |
549 | 615 | ||
@@ -561,6 +627,7 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc) | |||
561 | atomic_set(&osd->o_ref, 1); | 627 | atomic_set(&osd->o_ref, 1); |
562 | osd->o_osdc = osdc; | 628 | osd->o_osdc = osdc; |
563 | INIT_LIST_HEAD(&osd->o_requests); | 629 | INIT_LIST_HEAD(&osd->o_requests); |
630 | INIT_LIST_HEAD(&osd->o_linger_requests); | ||
564 | INIT_LIST_HEAD(&osd->o_osd_lru); | 631 | INIT_LIST_HEAD(&osd->o_osd_lru); |
565 | osd->o_incarnation = 1; | 632 | osd->o_incarnation = 1; |
566 | 633 | ||
@@ -650,7 +717,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) | |||
650 | int ret = 0; | 717 | int ret = 0; |
651 | 718 | ||
652 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); | 719 | dout("__reset_osd %p osd%d\n", osd, osd->o_osd); |
653 | if (list_empty(&osd->o_requests)) { | 720 | if (list_empty(&osd->o_requests) && |
721 | list_empty(&osd->o_linger_requests)) { | ||
654 | __remove_osd(osdc, osd); | 722 | __remove_osd(osdc, osd); |
655 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], | 723 | } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], |
656 | &osd->o_con.peer_addr, | 724 | &osd->o_con.peer_addr, |
@@ -723,10 +791,9 @@ static void __cancel_osd_timeout(struct ceph_osd_client *osdc) | |||
723 | * Register request, assign tid. If this is the first request, set up | 791 | * Register request, assign tid. If this is the first request, set up |
724 | * the timeout event. | 792 | * the timeout event. |
725 | */ | 793 | */ |
726 | static void register_request(struct ceph_osd_client *osdc, | 794 | static void __register_request(struct ceph_osd_client *osdc, |
727 | struct ceph_osd_request *req) | 795 | struct ceph_osd_request *req) |
728 | { | 796 | { |
729 | mutex_lock(&osdc->request_mutex); | ||
730 | req->r_tid = ++osdc->last_tid; | 797 | req->r_tid = ++osdc->last_tid; |
731 | req->r_request->hdr.tid = cpu_to_le64(req->r_tid); | 798 | req->r_request->hdr.tid = cpu_to_le64(req->r_tid); |
732 | INIT_LIST_HEAD(&req->r_req_lru_item); | 799 | INIT_LIST_HEAD(&req->r_req_lru_item); |
@@ -740,6 +807,13 @@ static void register_request(struct ceph_osd_client *osdc, | |||
740 | dout(" first request, scheduling timeout\n"); | 807 | dout(" first request, scheduling timeout\n"); |
741 | __schedule_osd_timeout(osdc); | 808 | __schedule_osd_timeout(osdc); |
742 | } | 809 | } |
810 | } | ||
811 | |||
812 | static void register_request(struct ceph_osd_client *osdc, | ||
813 | struct ceph_osd_request *req) | ||
814 | { | ||
815 | mutex_lock(&osdc->request_mutex); | ||
816 | __register_request(osdc, req); | ||
743 | mutex_unlock(&osdc->request_mutex); | 817 | mutex_unlock(&osdc->request_mutex); |
744 | } | 818 | } |
745 | 819 | ||
@@ -758,9 +832,14 @@ static void __unregister_request(struct ceph_osd_client *osdc, | |||
758 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); | 832 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); |
759 | 833 | ||
760 | list_del_init(&req->r_osd_item); | 834 | list_del_init(&req->r_osd_item); |
761 | if (list_empty(&req->r_osd->o_requests)) | 835 | if (list_empty(&req->r_osd->o_requests) && |
836 | list_empty(&req->r_osd->o_linger_requests)) { | ||
837 | dout("moving osd to %p lru\n", req->r_osd); | ||
762 | __move_osd_to_lru(osdc, req->r_osd); | 838 | __move_osd_to_lru(osdc, req->r_osd); |
763 | req->r_osd = NULL; | 839 | } |
840 | if (list_empty(&req->r_osd_item) && | ||
841 | list_empty(&req->r_linger_item)) | ||
842 | req->r_osd = NULL; | ||
764 | } | 843 | } |
765 | 844 | ||
766 | ceph_osdc_put_request(req); | 845 | ceph_osdc_put_request(req); |
@@ -781,20 +860,72 @@ static void __cancel_request(struct ceph_osd_request *req) | |||
781 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); | 860 | ceph_con_revoke(&req->r_osd->o_con, req->r_request); |
782 | req->r_sent = 0; | 861 | req->r_sent = 0; |
783 | } | 862 | } |
784 | list_del_init(&req->r_req_lru_item); | ||
785 | } | 863 | } |
786 | 864 | ||
865 | static void __register_linger_request(struct ceph_osd_client *osdc, | ||
866 | struct ceph_osd_request *req) | ||
867 | { | ||
868 | dout("__register_linger_request %p\n", req); | ||
869 | list_add_tail(&req->r_linger_item, &osdc->req_linger); | ||
870 | list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests); | ||
871 | } | ||
872 | |||
873 | static void __unregister_linger_request(struct ceph_osd_client *osdc, | ||
874 | struct ceph_osd_request *req) | ||
875 | { | ||
876 | dout("__unregister_linger_request %p\n", req); | ||
877 | if (req->r_osd) { | ||
878 | list_del_init(&req->r_linger_item); | ||
879 | list_del_init(&req->r_linger_osd); | ||
880 | |||
881 | if (list_empty(&req->r_osd->o_requests) && | ||
882 | list_empty(&req->r_osd->o_linger_requests)) { | ||
883 | dout("moving osd to %p lru\n", req->r_osd); | ||
884 | __move_osd_to_lru(osdc, req->r_osd); | ||
885 | } | ||
886 | req->r_osd = NULL; | ||
887 | } | ||
888 | } | ||
889 | |||
890 | void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, | ||
891 | struct ceph_osd_request *req) | ||
892 | { | ||
893 | mutex_lock(&osdc->request_mutex); | ||
894 | if (req->r_linger) { | ||
895 | __unregister_linger_request(osdc, req); | ||
896 | ceph_osdc_put_request(req); | ||
897 | } | ||
898 | mutex_unlock(&osdc->request_mutex); | ||
899 | } | ||
900 | EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); | ||
901 | |||
902 | void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | ||
903 | struct ceph_osd_request *req) | ||
904 | { | ||
905 | if (!req->r_linger) { | ||
906 | dout("set_request_linger %p\n", req); | ||
907 | req->r_linger = 1; | ||
908 | /* | ||
909 | * caller is now responsible for calling | ||
910 | * unregister_linger_request | ||
911 | */ | ||
912 | ceph_osdc_get_request(req); | ||
913 | } | ||
914 | } | ||
915 | EXPORT_SYMBOL(ceph_osdc_set_request_linger); | ||
916 | |||
787 | /* | 917 | /* |
788 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct | 918 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct |
789 | * (as needed), and set the request r_osd appropriately. If there is | 919 | * (as needed), and set the request r_osd appropriately. If there is |
790 | * no up osd, set r_osd to NULL. | 920 | * no up osd, set r_osd to NULL. Move the request to the appropiate list |
921 | * (unsent, homeless) or leave on in-flight lru. | ||
791 | * | 922 | * |
792 | * Return 0 if unchanged, 1 if changed, or negative on error. | 923 | * Return 0 if unchanged, 1 if changed, or negative on error. |
793 | * | 924 | * |
794 | * Caller should hold map_sem for read and request_mutex. | 925 | * Caller should hold map_sem for read and request_mutex. |
795 | */ | 926 | */ |
796 | static int __map_osds(struct ceph_osd_client *osdc, | 927 | static int __map_request(struct ceph_osd_client *osdc, |
797 | struct ceph_osd_request *req) | 928 | struct ceph_osd_request *req) |
798 | { | 929 | { |
799 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; | 930 | struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; |
800 | struct ceph_pg pgid; | 931 | struct ceph_pg pgid; |
@@ -802,11 +933,13 @@ static int __map_osds(struct ceph_osd_client *osdc, | |||
802 | int o = -1, num = 0; | 933 | int o = -1, num = 0; |
803 | int err; | 934 | int err; |
804 | 935 | ||
805 | dout("map_osds %p tid %lld\n", req, req->r_tid); | 936 | dout("map_request %p tid %lld\n", req, req->r_tid); |
806 | err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, | 937 | err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, |
807 | &req->r_file_layout, osdc->osdmap); | 938 | &req->r_file_layout, osdc->osdmap); |
808 | if (err) | 939 | if (err) { |
940 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | ||
809 | return err; | 941 | return err; |
942 | } | ||
810 | pgid = reqhead->layout.ol_pgid; | 943 | pgid = reqhead->layout.ol_pgid; |
811 | req->r_pgid = pgid; | 944 | req->r_pgid = pgid; |
812 | 945 | ||
@@ -823,7 +956,7 @@ static int __map_osds(struct ceph_osd_client *osdc, | |||
823 | (req->r_osd == NULL && o == -1)) | 956 | (req->r_osd == NULL && o == -1)) |
824 | return 0; /* no change */ | 957 | return 0; /* no change */ |
825 | 958 | ||
826 | dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n", | 959 | dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", |
827 | req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, | 960 | req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, |
828 | req->r_osd ? req->r_osd->o_osd : -1); | 961 | req->r_osd ? req->r_osd->o_osd : -1); |
829 | 962 | ||
@@ -841,10 +974,12 @@ static int __map_osds(struct ceph_osd_client *osdc, | |||
841 | if (!req->r_osd && o >= 0) { | 974 | if (!req->r_osd && o >= 0) { |
842 | err = -ENOMEM; | 975 | err = -ENOMEM; |
843 | req->r_osd = create_osd(osdc); | 976 | req->r_osd = create_osd(osdc); |
844 | if (!req->r_osd) | 977 | if (!req->r_osd) { |
978 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | ||
845 | goto out; | 979 | goto out; |
980 | } | ||
846 | 981 | ||
847 | dout("map_osds osd %p is osd%d\n", req->r_osd, o); | 982 | dout("map_request osd %p is osd%d\n", req->r_osd, o); |
848 | req->r_osd->o_osd = o; | 983 | req->r_osd->o_osd = o; |
849 | req->r_osd->o_con.peer_name.num = cpu_to_le64(o); | 984 | req->r_osd->o_con.peer_name.num = cpu_to_le64(o); |
850 | __insert_osd(osdc, req->r_osd); | 985 | __insert_osd(osdc, req->r_osd); |
@@ -855,6 +990,9 @@ static int __map_osds(struct ceph_osd_client *osdc, | |||
855 | if (req->r_osd) { | 990 | if (req->r_osd) { |
856 | __remove_osd_from_lru(req->r_osd); | 991 | __remove_osd_from_lru(req->r_osd); |
857 | list_add(&req->r_osd_item, &req->r_osd->o_requests); | 992 | list_add(&req->r_osd_item, &req->r_osd->o_requests); |
993 | list_move(&req->r_req_lru_item, &osdc->req_unsent); | ||
994 | } else { | ||
995 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | ||
858 | } | 996 | } |
859 | err = 1; /* osd or pg changed */ | 997 | err = 1; /* osd or pg changed */ |
860 | 998 | ||
@@ -869,16 +1007,6 @@ static int __send_request(struct ceph_osd_client *osdc, | |||
869 | struct ceph_osd_request *req) | 1007 | struct ceph_osd_request *req) |
870 | { | 1008 | { |
871 | struct ceph_osd_request_head *reqhead; | 1009 | struct ceph_osd_request_head *reqhead; |
872 | int err; | ||
873 | |||
874 | err = __map_osds(osdc, req); | ||
875 | if (err < 0) | ||
876 | return err; | ||
877 | if (req->r_osd == NULL) { | ||
878 | dout("send_request %p no up osds in pg\n", req); | ||
879 | ceph_monc_request_next_osdmap(&osdc->client->monc); | ||
880 | return 0; | ||
881 | } | ||
882 | 1010 | ||
883 | dout("send_request %p tid %llu to osd%d flags %d\n", | 1011 | dout("send_request %p tid %llu to osd%d flags %d\n", |
884 | req, req->r_tid, req->r_osd->o_osd, req->r_flags); | 1012 | req, req->r_tid, req->r_osd->o_osd, req->r_flags); |
@@ -898,6 +1026,21 @@ static int __send_request(struct ceph_osd_client *osdc, | |||
898 | } | 1026 | } |
899 | 1027 | ||
900 | /* | 1028 | /* |
1029 | * Send any requests in the queue (req_unsent). | ||
1030 | */ | ||
1031 | static void send_queued(struct ceph_osd_client *osdc) | ||
1032 | { | ||
1033 | struct ceph_osd_request *req, *tmp; | ||
1034 | |||
1035 | dout("send_queued\n"); | ||
1036 | mutex_lock(&osdc->request_mutex); | ||
1037 | list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { | ||
1038 | __send_request(osdc, req); | ||
1039 | } | ||
1040 | mutex_unlock(&osdc->request_mutex); | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
901 | * Timeout callback, called every N seconds when 1 or more osd | 1044 | * Timeout callback, called every N seconds when 1 or more osd |
902 | * requests has been active for more than N seconds. When this | 1045 | * requests has been active for more than N seconds. When this |
903 | * happens, we ping all OSDs with requests who have timed out to | 1046 | * happens, we ping all OSDs with requests who have timed out to |
@@ -916,30 +1059,13 @@ static void handle_timeout(struct work_struct *work) | |||
916 | unsigned long keepalive = | 1059 | unsigned long keepalive = |
917 | osdc->client->options->osd_keepalive_timeout * HZ; | 1060 | osdc->client->options->osd_keepalive_timeout * HZ; |
918 | unsigned long last_stamp = 0; | 1061 | unsigned long last_stamp = 0; |
919 | struct rb_node *p; | ||
920 | struct list_head slow_osds; | 1062 | struct list_head slow_osds; |
921 | |||
922 | dout("timeout\n"); | 1063 | dout("timeout\n"); |
923 | down_read(&osdc->map_sem); | 1064 | down_read(&osdc->map_sem); |
924 | 1065 | ||
925 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 1066 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
926 | 1067 | ||
927 | mutex_lock(&osdc->request_mutex); | 1068 | mutex_lock(&osdc->request_mutex); |
928 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | ||
929 | req = rb_entry(p, struct ceph_osd_request, r_node); | ||
930 | |||
931 | if (req->r_resend) { | ||
932 | int err; | ||
933 | |||
934 | dout("osdc resending prev failed %lld\n", req->r_tid); | ||
935 | err = __send_request(osdc, req); | ||
936 | if (err) | ||
937 | dout("osdc failed again on %lld\n", req->r_tid); | ||
938 | else | ||
939 | req->r_resend = false; | ||
940 | continue; | ||
941 | } | ||
942 | } | ||
943 | 1069 | ||
944 | /* | 1070 | /* |
945 | * reset osds that appear to be _really_ unresponsive. this | 1071 | * reset osds that appear to be _really_ unresponsive. this |
@@ -963,7 +1089,7 @@ static void handle_timeout(struct work_struct *work) | |||
963 | BUG_ON(!osd); | 1089 | BUG_ON(!osd); |
964 | pr_warning(" tid %llu timed out on osd%d, will reset osd\n", | 1090 | pr_warning(" tid %llu timed out on osd%d, will reset osd\n", |
965 | req->r_tid, osd->o_osd); | 1091 | req->r_tid, osd->o_osd); |
966 | __kick_requests(osdc, osd); | 1092 | __kick_osd_requests(osdc, osd); |
967 | } | 1093 | } |
968 | 1094 | ||
969 | /* | 1095 | /* |
@@ -991,7 +1117,7 @@ static void handle_timeout(struct work_struct *work) | |||
991 | 1117 | ||
992 | __schedule_osd_timeout(osdc); | 1118 | __schedule_osd_timeout(osdc); |
993 | mutex_unlock(&osdc->request_mutex); | 1119 | mutex_unlock(&osdc->request_mutex); |
994 | 1120 | send_queued(osdc); | |
995 | up_read(&osdc->map_sem); | 1121 | up_read(&osdc->map_sem); |
996 | } | 1122 | } |
997 | 1123 | ||
@@ -1035,7 +1161,6 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1035 | numops * sizeof(struct ceph_osd_op)) | 1161 | numops * sizeof(struct ceph_osd_op)) |
1036 | goto bad; | 1162 | goto bad; |
1037 | dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); | 1163 | dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); |
1038 | |||
1039 | /* lookup */ | 1164 | /* lookup */ |
1040 | mutex_lock(&osdc->request_mutex); | 1165 | mutex_lock(&osdc->request_mutex); |
1041 | req = __lookup_request(osdc, tid); | 1166 | req = __lookup_request(osdc, tid); |
@@ -1079,6 +1204,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1079 | 1204 | ||
1080 | dout("handle_reply tid %llu flags %d\n", tid, flags); | 1205 | dout("handle_reply tid %llu flags %d\n", tid, flags); |
1081 | 1206 | ||
1207 | if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) | ||
1208 | __register_linger_request(osdc, req); | ||
1209 | |||
1082 | /* either this is a read, or we got the safe response */ | 1210 | /* either this is a read, or we got the safe response */ |
1083 | if (result < 0 || | 1211 | if (result < 0 || |
1084 | (flags & CEPH_OSD_FLAG_ONDISK) || | 1212 | (flags & CEPH_OSD_FLAG_ONDISK) || |
@@ -1099,6 +1227,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1099 | } | 1227 | } |
1100 | 1228 | ||
1101 | done: | 1229 | done: |
1230 | dout("req=%p req->r_linger=%d\n", req, req->r_linger); | ||
1102 | ceph_osdc_put_request(req); | 1231 | ceph_osdc_put_request(req); |
1103 | return; | 1232 | return; |
1104 | 1233 | ||
@@ -1109,108 +1238,83 @@ bad: | |||
1109 | ceph_msg_dump(msg); | 1238 | ceph_msg_dump(msg); |
1110 | } | 1239 | } |
1111 | 1240 | ||
1112 | 1241 | static void reset_changed_osds(struct ceph_osd_client *osdc) | |
1113 | static int __kick_requests(struct ceph_osd_client *osdc, | ||
1114 | struct ceph_osd *kickosd) | ||
1115 | { | 1242 | { |
1116 | struct ceph_osd_request *req; | ||
1117 | struct rb_node *p, *n; | 1243 | struct rb_node *p, *n; |
1118 | int needmap = 0; | ||
1119 | int err; | ||
1120 | 1244 | ||
1121 | dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1); | 1245 | for (p = rb_first(&osdc->osds); p; p = n) { |
1122 | if (kickosd) { | 1246 | struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); |
1123 | err = __reset_osd(osdc, kickosd); | 1247 | |
1124 | if (err == -EAGAIN) | 1248 | n = rb_next(p); |
1125 | return 1; | 1249 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || |
1126 | } else { | 1250 | memcmp(&osd->o_con.peer_addr, |
1127 | for (p = rb_first(&osdc->osds); p; p = n) { | 1251 | ceph_osd_addr(osdc->osdmap, |
1128 | struct ceph_osd *osd = | 1252 | osd->o_osd), |
1129 | rb_entry(p, struct ceph_osd, o_node); | 1253 | sizeof(struct ceph_entity_addr)) != 0) |
1130 | 1254 | __reset_osd(osdc, osd); | |
1131 | n = rb_next(p); | ||
1132 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || | ||
1133 | memcmp(&osd->o_con.peer_addr, | ||
1134 | ceph_osd_addr(osdc->osdmap, | ||
1135 | osd->o_osd), | ||
1136 | sizeof(struct ceph_entity_addr)) != 0) | ||
1137 | __reset_osd(osdc, osd); | ||
1138 | } | ||
1139 | } | 1255 | } |
1256 | } | ||
1257 | |||
1258 | /* | ||
1259 | * Requeue requests whose mapping to an OSD has changed. If requests map to | ||
1260 | * no osd, request a new map. | ||
1261 | * | ||
1262 | * Caller should hold map_sem for read and request_mutex. | ||
1263 | */ | ||
1264 | static void kick_requests(struct ceph_osd_client *osdc) | ||
1265 | { | ||
1266 | struct ceph_osd_request *req, *nreq; | ||
1267 | struct rb_node *p; | ||
1268 | int needmap = 0; | ||
1269 | int err; | ||
1140 | 1270 | ||
1271 | dout("kick_requests\n"); | ||
1272 | mutex_lock(&osdc->request_mutex); | ||
1141 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | 1273 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { |
1142 | req = rb_entry(p, struct ceph_osd_request, r_node); | 1274 | req = rb_entry(p, struct ceph_osd_request, r_node); |
1143 | 1275 | err = __map_request(osdc, req); | |
1144 | if (req->r_resend) { | 1276 | if (err < 0) |
1145 | dout(" r_resend set on tid %llu\n", req->r_tid); | 1277 | continue; /* error */ |
1146 | __cancel_request(req); | 1278 | if (req->r_osd == NULL) { |
1147 | goto kick; | 1279 | dout("%p tid %llu maps to no osd\n", req, req->r_tid); |
1148 | } | 1280 | needmap++; /* request a newer map */ |
1149 | if (req->r_osd && kickosd == req->r_osd) { | 1281 | } else if (err > 0) { |
1150 | __cancel_request(req); | 1282 | dout("%p tid %llu requeued on osd%d\n", req, req->r_tid, |
1151 | goto kick; | 1283 | req->r_osd ? req->r_osd->o_osd : -1); |
1284 | if (!req->r_linger) | ||
1285 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | ||
1152 | } | 1286 | } |
1287 | } | ||
1288 | |||
1289 | list_for_each_entry_safe(req, nreq, &osdc->req_linger, | ||
1290 | r_linger_item) { | ||
1291 | dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); | ||
1153 | 1292 | ||
1154 | err = __map_osds(osdc, req); | 1293 | err = __map_request(osdc, req); |
1155 | if (err == 0) | 1294 | if (err == 0) |
1156 | continue; /* no change */ | 1295 | continue; /* no change and no osd was specified */ |
1157 | if (err < 0) { | 1296 | if (err < 0) |
1158 | /* | 1297 | continue; /* hrm! */ |
1159 | * FIXME: really, we should set the request | ||
1160 | * error and fail if this isn't a 'nofail' | ||
1161 | * request, but that's a fair bit more | ||
1162 | * complicated to do. So retry! | ||
1163 | */ | ||
1164 | dout(" setting r_resend on %llu\n", req->r_tid); | ||
1165 | req->r_resend = true; | ||
1166 | continue; | ||
1167 | } | ||
1168 | if (req->r_osd == NULL) { | 1298 | if (req->r_osd == NULL) { |
1169 | dout("tid %llu maps to no valid osd\n", req->r_tid); | 1299 | dout("tid %llu maps to no valid osd\n", req->r_tid); |
1170 | needmap++; /* request a newer map */ | 1300 | needmap++; /* request a newer map */ |
1171 | continue; | 1301 | continue; |
1172 | } | 1302 | } |
1173 | 1303 | ||
1174 | kick: | 1304 | dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, |
1175 | dout("kicking %p tid %llu osd%d\n", req, req->r_tid, | ||
1176 | req->r_osd ? req->r_osd->o_osd : -1); | 1305 | req->r_osd ? req->r_osd->o_osd : -1); |
1177 | req->r_flags |= CEPH_OSD_FLAG_RETRY; | 1306 | __unregister_linger_request(osdc, req); |
1178 | err = __send_request(osdc, req); | 1307 | __register_request(osdc, req); |
1179 | if (err) { | ||
1180 | dout(" setting r_resend on %llu\n", req->r_tid); | ||
1181 | req->r_resend = true; | ||
1182 | } | ||
1183 | } | 1308 | } |
1184 | |||
1185 | return needmap; | ||
1186 | } | ||
1187 | |||
1188 | /* | ||
1189 | * Resubmit osd requests whose osd or osd address has changed. Request | ||
1190 | * a new osd map if osds are down, or we are otherwise unable to determine | ||
1191 | * how to direct a request. | ||
1192 | * | ||
1193 | * Close connections to down osds. | ||
1194 | * | ||
1195 | * If @who is specified, resubmit requests for that specific osd. | ||
1196 | * | ||
1197 | * Caller should hold map_sem for read and request_mutex. | ||
1198 | */ | ||
1199 | static void kick_requests(struct ceph_osd_client *osdc, | ||
1200 | struct ceph_osd *kickosd) | ||
1201 | { | ||
1202 | int needmap; | ||
1203 | |||
1204 | mutex_lock(&osdc->request_mutex); | ||
1205 | needmap = __kick_requests(osdc, kickosd); | ||
1206 | mutex_unlock(&osdc->request_mutex); | 1309 | mutex_unlock(&osdc->request_mutex); |
1207 | 1310 | ||
1208 | if (needmap) { | 1311 | if (needmap) { |
1209 | dout("%d requests for down osds, need new map\n", needmap); | 1312 | dout("%d requests for down osds, need new map\n", needmap); |
1210 | ceph_monc_request_next_osdmap(&osdc->client->monc); | 1313 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
1211 | } | 1314 | } |
1212 | |||
1213 | } | 1315 | } |
1316 | |||
1317 | |||
1214 | /* | 1318 | /* |
1215 | * Process updated osd map. | 1319 | * Process updated osd map. |
1216 | * | 1320 | * |
@@ -1263,6 +1367,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1263 | ceph_osdmap_destroy(osdc->osdmap); | 1367 | ceph_osdmap_destroy(osdc->osdmap); |
1264 | osdc->osdmap = newmap; | 1368 | osdc->osdmap = newmap; |
1265 | } | 1369 | } |
1370 | kick_requests(osdc); | ||
1371 | reset_changed_osds(osdc); | ||
1266 | } else { | 1372 | } else { |
1267 | dout("ignoring incremental map %u len %d\n", | 1373 | dout("ignoring incremental map %u len %d\n", |
1268 | epoch, maplen); | 1374 | epoch, maplen); |
@@ -1300,6 +1406,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1300 | osdc->osdmap = newmap; | 1406 | osdc->osdmap = newmap; |
1301 | if (oldmap) | 1407 | if (oldmap) |
1302 | ceph_osdmap_destroy(oldmap); | 1408 | ceph_osdmap_destroy(oldmap); |
1409 | kick_requests(osdc); | ||
1303 | } | 1410 | } |
1304 | p += maplen; | 1411 | p += maplen; |
1305 | nr_maps--; | 1412 | nr_maps--; |
@@ -1308,8 +1415,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) | |||
1308 | done: | 1415 | done: |
1309 | downgrade_write(&osdc->map_sem); | 1416 | downgrade_write(&osdc->map_sem); |
1310 | ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); | 1417 | ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); |
1311 | if (newmap) | 1418 | send_queued(osdc); |
1312 | kick_requests(osdc, NULL); | ||
1313 | up_read(&osdc->map_sem); | 1419 | up_read(&osdc->map_sem); |
1314 | wake_up_all(&osdc->client->auth_wq); | 1420 | wake_up_all(&osdc->client->auth_wq); |
1315 | return; | 1421 | return; |
@@ -1322,6 +1428,223 @@ bad: | |||
1322 | } | 1428 | } |
1323 | 1429 | ||
1324 | /* | 1430 | /* |
1431 | * watch/notify callback event infrastructure | ||
1432 | * | ||
1433 | * These callbacks are used both for watch and notify operations. | ||
1434 | */ | ||
1435 | static void __release_event(struct kref *kref) | ||
1436 | { | ||
1437 | struct ceph_osd_event *event = | ||
1438 | container_of(kref, struct ceph_osd_event, kref); | ||
1439 | |||
1440 | dout("__release_event %p\n", event); | ||
1441 | kfree(event); | ||
1442 | } | ||
1443 | |||
1444 | static void get_event(struct ceph_osd_event *event) | ||
1445 | { | ||
1446 | kref_get(&event->kref); | ||
1447 | } | ||
1448 | |||
1449 | void ceph_osdc_put_event(struct ceph_osd_event *event) | ||
1450 | { | ||
1451 | kref_put(&event->kref, __release_event); | ||
1452 | } | ||
1453 | EXPORT_SYMBOL(ceph_osdc_put_event); | ||
1454 | |||
1455 | static void __insert_event(struct ceph_osd_client *osdc, | ||
1456 | struct ceph_osd_event *new) | ||
1457 | { | ||
1458 | struct rb_node **p = &osdc->event_tree.rb_node; | ||
1459 | struct rb_node *parent = NULL; | ||
1460 | struct ceph_osd_event *event = NULL; | ||
1461 | |||
1462 | while (*p) { | ||
1463 | parent = *p; | ||
1464 | event = rb_entry(parent, struct ceph_osd_event, node); | ||
1465 | if (new->cookie < event->cookie) | ||
1466 | p = &(*p)->rb_left; | ||
1467 | else if (new->cookie > event->cookie) | ||
1468 | p = &(*p)->rb_right; | ||
1469 | else | ||
1470 | BUG(); | ||
1471 | } | ||
1472 | |||
1473 | rb_link_node(&new->node, parent, p); | ||
1474 | rb_insert_color(&new->node, &osdc->event_tree); | ||
1475 | } | ||
1476 | |||
1477 | static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, | ||
1478 | u64 cookie) | ||
1479 | { | ||
1480 | struct rb_node **p = &osdc->event_tree.rb_node; | ||
1481 | struct rb_node *parent = NULL; | ||
1482 | struct ceph_osd_event *event = NULL; | ||
1483 | |||
1484 | while (*p) { | ||
1485 | parent = *p; | ||
1486 | event = rb_entry(parent, struct ceph_osd_event, node); | ||
1487 | if (cookie < event->cookie) | ||
1488 | p = &(*p)->rb_left; | ||
1489 | else if (cookie > event->cookie) | ||
1490 | p = &(*p)->rb_right; | ||
1491 | else | ||
1492 | return event; | ||
1493 | } | ||
1494 | return NULL; | ||
1495 | } | ||
1496 | |||
1497 | static void __remove_event(struct ceph_osd_event *event) | ||
1498 | { | ||
1499 | struct ceph_osd_client *osdc = event->osdc; | ||
1500 | |||
1501 | if (!RB_EMPTY_NODE(&event->node)) { | ||
1502 | dout("__remove_event removed %p\n", event); | ||
1503 | rb_erase(&event->node, &osdc->event_tree); | ||
1504 | ceph_osdc_put_event(event); | ||
1505 | } else { | ||
1506 | dout("__remove_event didn't remove %p\n", event); | ||
1507 | } | ||
1508 | } | ||
1509 | |||
1510 | int ceph_osdc_create_event(struct ceph_osd_client *osdc, | ||
1511 | void (*event_cb)(u64, u64, u8, void *), | ||
1512 | int one_shot, void *data, | ||
1513 | struct ceph_osd_event **pevent) | ||
1514 | { | ||
1515 | struct ceph_osd_event *event; | ||
1516 | |||
1517 | event = kmalloc(sizeof(*event), GFP_NOIO); | ||
1518 | if (!event) | ||
1519 | return -ENOMEM; | ||
1520 | |||
1521 | dout("create_event %p\n", event); | ||
1522 | event->cb = event_cb; | ||
1523 | event->one_shot = one_shot; | ||
1524 | event->data = data; | ||
1525 | event->osdc = osdc; | ||
1526 | INIT_LIST_HEAD(&event->osd_node); | ||
1527 | kref_init(&event->kref); /* one ref for us */ | ||
1528 | kref_get(&event->kref); /* one ref for the caller */ | ||
1529 | init_completion(&event->completion); | ||
1530 | |||
1531 | spin_lock(&osdc->event_lock); | ||
1532 | event->cookie = ++osdc->event_count; | ||
1533 | __insert_event(osdc, event); | ||
1534 | spin_unlock(&osdc->event_lock); | ||
1535 | |||
1536 | *pevent = event; | ||
1537 | return 0; | ||
1538 | } | ||
1539 | EXPORT_SYMBOL(ceph_osdc_create_event); | ||
1540 | |||
1541 | void ceph_osdc_cancel_event(struct ceph_osd_event *event) | ||
1542 | { | ||
1543 | struct ceph_osd_client *osdc = event->osdc; | ||
1544 | |||
1545 | dout("cancel_event %p\n", event); | ||
1546 | spin_lock(&osdc->event_lock); | ||
1547 | __remove_event(event); | ||
1548 | spin_unlock(&osdc->event_lock); | ||
1549 | ceph_osdc_put_event(event); /* caller's */ | ||
1550 | } | ||
1551 | EXPORT_SYMBOL(ceph_osdc_cancel_event); | ||
1552 | |||
1553 | |||
1554 | static void do_event_work(struct work_struct *work) | ||
1555 | { | ||
1556 | struct ceph_osd_event_work *event_work = | ||
1557 | container_of(work, struct ceph_osd_event_work, work); | ||
1558 | struct ceph_osd_event *event = event_work->event; | ||
1559 | u64 ver = event_work->ver; | ||
1560 | u64 notify_id = event_work->notify_id; | ||
1561 | u8 opcode = event_work->opcode; | ||
1562 | |||
1563 | dout("do_event_work completing %p\n", event); | ||
1564 | event->cb(ver, notify_id, opcode, event->data); | ||
1565 | complete(&event->completion); | ||
1566 | dout("do_event_work completed %p\n", event); | ||
1567 | ceph_osdc_put_event(event); | ||
1568 | kfree(event_work); | ||
1569 | } | ||
1570 | |||
1571 | |||
1572 | /* | ||
1573 | * Process osd watch notifications | ||
1574 | */ | ||
1575 | void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) | ||
1576 | { | ||
1577 | void *p, *end; | ||
1578 | u8 proto_ver; | ||
1579 | u64 cookie, ver, notify_id; | ||
1580 | u8 opcode; | ||
1581 | struct ceph_osd_event *event; | ||
1582 | struct ceph_osd_event_work *event_work; | ||
1583 | |||
1584 | p = msg->front.iov_base; | ||
1585 | end = p + msg->front.iov_len; | ||
1586 | |||
1587 | ceph_decode_8_safe(&p, end, proto_ver, bad); | ||
1588 | ceph_decode_8_safe(&p, end, opcode, bad); | ||
1589 | ceph_decode_64_safe(&p, end, cookie, bad); | ||
1590 | ceph_decode_64_safe(&p, end, ver, bad); | ||
1591 | ceph_decode_64_safe(&p, end, notify_id, bad); | ||
1592 | |||
1593 | spin_lock(&osdc->event_lock); | ||
1594 | event = __find_event(osdc, cookie); | ||
1595 | if (event) { | ||
1596 | get_event(event); | ||
1597 | if (event->one_shot) | ||
1598 | __remove_event(event); | ||
1599 | } | ||
1600 | spin_unlock(&osdc->event_lock); | ||
1601 | dout("handle_watch_notify cookie %lld ver %lld event %p\n", | ||
1602 | cookie, ver, event); | ||
1603 | if (event) { | ||
1604 | event_work = kmalloc(sizeof(*event_work), GFP_NOIO); | ||
1605 | INIT_WORK(&event_work->work, do_event_work); | ||
1606 | if (!event_work) { | ||
1607 | dout("ERROR: could not allocate event_work\n"); | ||
1608 | goto done_err; | ||
1609 | } | ||
1610 | event_work->event = event; | ||
1611 | event_work->ver = ver; | ||
1612 | event_work->notify_id = notify_id; | ||
1613 | event_work->opcode = opcode; | ||
1614 | if (!queue_work(osdc->notify_wq, &event_work->work)) { | ||
1615 | dout("WARNING: failed to queue notify event work\n"); | ||
1616 | goto done_err; | ||
1617 | } | ||
1618 | } | ||
1619 | |||
1620 | return; | ||
1621 | |||
1622 | done_err: | ||
1623 | complete(&event->completion); | ||
1624 | ceph_osdc_put_event(event); | ||
1625 | return; | ||
1626 | |||
1627 | bad: | ||
1628 | pr_err("osdc handle_watch_notify corrupt msg\n"); | ||
1629 | return; | ||
1630 | } | ||
1631 | |||
1632 | int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) | ||
1633 | { | ||
1634 | int err; | ||
1635 | |||
1636 | dout("wait_event %p\n", event); | ||
1637 | err = wait_for_completion_interruptible_timeout(&event->completion, | ||
1638 | timeout * HZ); | ||
1639 | ceph_osdc_put_event(event); | ||
1640 | if (err > 0) | ||
1641 | err = 0; | ||
1642 | dout("wait_event %p returns %d\n", event, err); | ||
1643 | return err; | ||
1644 | } | ||
1645 | EXPORT_SYMBOL(ceph_osdc_wait_event); | ||
1646 | |||
1647 | /* | ||
1325 | * Register request, send initial attempt. | 1648 | * Register request, send initial attempt. |
1326 | */ | 1649 | */ |
1327 | int ceph_osdc_start_request(struct ceph_osd_client *osdc, | 1650 | int ceph_osdc_start_request(struct ceph_osd_client *osdc, |
@@ -1347,15 +1670,22 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, | |||
1347 | * the request still han't been touched yet. | 1670 | * the request still han't been touched yet. |
1348 | */ | 1671 | */ |
1349 | if (req->r_sent == 0) { | 1672 | if (req->r_sent == 0) { |
1350 | rc = __send_request(osdc, req); | 1673 | rc = __map_request(osdc, req); |
1351 | if (rc) { | 1674 | if (rc < 0) |
1352 | if (nofail) { | 1675 | return rc; |
1353 | dout("osdc_start_request failed send, " | 1676 | if (req->r_osd == NULL) { |
1354 | " marking %lld\n", req->r_tid); | 1677 | dout("send_request %p no up osds in pg\n", req); |
1355 | req->r_resend = true; | 1678 | ceph_monc_request_next_osdmap(&osdc->client->monc); |
1356 | rc = 0; | 1679 | } else { |
1357 | } else { | 1680 | rc = __send_request(osdc, req); |
1358 | __unregister_request(osdc, req); | 1681 | if (rc) { |
1682 | if (nofail) { | ||
1683 | dout("osdc_start_request failed send, " | ||
1684 | " will retry %lld\n", req->r_tid); | ||
1685 | rc = 0; | ||
1686 | } else { | ||
1687 | __unregister_request(osdc, req); | ||
1688 | } | ||
1359 | } | 1689 | } |
1360 | } | 1690 | } |
1361 | } | 1691 | } |
@@ -1441,9 +1771,15 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | |||
1441 | INIT_LIST_HEAD(&osdc->osd_lru); | 1771 | INIT_LIST_HEAD(&osdc->osd_lru); |
1442 | osdc->requests = RB_ROOT; | 1772 | osdc->requests = RB_ROOT; |
1443 | INIT_LIST_HEAD(&osdc->req_lru); | 1773 | INIT_LIST_HEAD(&osdc->req_lru); |
1774 | INIT_LIST_HEAD(&osdc->req_unsent); | ||
1775 | INIT_LIST_HEAD(&osdc->req_notarget); | ||
1776 | INIT_LIST_HEAD(&osdc->req_linger); | ||
1444 | osdc->num_requests = 0; | 1777 | osdc->num_requests = 0; |
1445 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); | 1778 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); |
1446 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); | 1779 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); |
1780 | spin_lock_init(&osdc->event_lock); | ||
1781 | osdc->event_tree = RB_ROOT; | ||
1782 | osdc->event_count = 0; | ||
1447 | 1783 | ||
1448 | schedule_delayed_work(&osdc->osds_timeout_work, | 1784 | schedule_delayed_work(&osdc->osds_timeout_work, |
1449 | round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); | 1785 | round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); |
@@ -1463,6 +1799,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | |||
1463 | "osd_op_reply"); | 1799 | "osd_op_reply"); |
1464 | if (err < 0) | 1800 | if (err < 0) |
1465 | goto out_msgpool; | 1801 | goto out_msgpool; |
1802 | |||
1803 | osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); | ||
1804 | if (IS_ERR(osdc->notify_wq)) { | ||
1805 | err = PTR_ERR(osdc->notify_wq); | ||
1806 | osdc->notify_wq = NULL; | ||
1807 | goto out_msgpool; | ||
1808 | } | ||
1466 | return 0; | 1809 | return 0; |
1467 | 1810 | ||
1468 | out_msgpool: | 1811 | out_msgpool: |
@@ -1476,6 +1819,8 @@ EXPORT_SYMBOL(ceph_osdc_init); | |||
1476 | 1819 | ||
1477 | void ceph_osdc_stop(struct ceph_osd_client *osdc) | 1820 | void ceph_osdc_stop(struct ceph_osd_client *osdc) |
1478 | { | 1821 | { |
1822 | flush_workqueue(osdc->notify_wq); | ||
1823 | destroy_workqueue(osdc->notify_wq); | ||
1479 | cancel_delayed_work_sync(&osdc->timeout_work); | 1824 | cancel_delayed_work_sync(&osdc->timeout_work); |
1480 | cancel_delayed_work_sync(&osdc->osds_timeout_work); | 1825 | cancel_delayed_work_sync(&osdc->osds_timeout_work); |
1481 | if (osdc->osdmap) { | 1826 | if (osdc->osdmap) { |
@@ -1483,6 +1828,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) | |||
1483 | osdc->osdmap = NULL; | 1828 | osdc->osdmap = NULL; |
1484 | } | 1829 | } |
1485 | remove_old_osds(osdc, 1); | 1830 | remove_old_osds(osdc, 1); |
1831 | WARN_ON(!RB_EMPTY_ROOT(&osdc->osds)); | ||
1486 | mempool_destroy(osdc->req_mempool); | 1832 | mempool_destroy(osdc->req_mempool); |
1487 | ceph_msgpool_destroy(&osdc->msgpool_op); | 1833 | ceph_msgpool_destroy(&osdc->msgpool_op); |
1488 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); | 1834 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); |
@@ -1591,6 +1937,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) | |||
1591 | case CEPH_MSG_OSD_OPREPLY: | 1937 | case CEPH_MSG_OSD_OPREPLY: |
1592 | handle_reply(osdc, msg, con); | 1938 | handle_reply(osdc, msg, con); |
1593 | break; | 1939 | break; |
1940 | case CEPH_MSG_WATCH_NOTIFY: | ||
1941 | handle_watch_notify(osdc, msg); | ||
1942 | break; | ||
1594 | 1943 | ||
1595 | default: | 1944 | default: |
1596 | pr_err("received unknown message type %d %s\n", type, | 1945 | pr_err("received unknown message type %d %s\n", type, |
@@ -1684,6 +2033,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, | |||
1684 | 2033 | ||
1685 | switch (type) { | 2034 | switch (type) { |
1686 | case CEPH_MSG_OSD_MAP: | 2035 | case CEPH_MSG_OSD_MAP: |
2036 | case CEPH_MSG_WATCH_NOTIFY: | ||
1687 | return ceph_msg_new(type, front, GFP_NOFS); | 2037 | return ceph_msg_new(type, front, GFP_NOFS); |
1688 | case CEPH_MSG_OSD_OPREPLY: | 2038 | case CEPH_MSG_OSD_OPREPLY: |
1689 | return get_reply(con, hdr, skip); | 2039 | return get_reply(con, hdr, skip); |