diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2016-05-25 19:15:02 -0400 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2016-05-25 19:15:02 -0400 |
commit | 922dab6134178cae317ae00de86376cba59f3147 (patch) | |
tree | a7047a5950b6a8505cc1e6852e4532656064fede /include/linux/ceph | |
parent | c525f03601f52c83ded046624138f2a45e0ba56c (diff) |
libceph, rbd: ceph_osd_linger_request, watch/notify v2
This adds support for, and switches rbd to, a new, more reliable version
of the watch/notify protocol. As with the OSD client update, this is
mostly about getting the right structures linked into the right places so
that reconnects are properly sent when needed. watch/notify v2 also
requires sending regular pings to the OSDs - see send_linger_ping().
A major change from the old watch/notify implementation is the
introduction of ceph_osd_linger_request - linger requests no longer
piggyback on ceph_osd_request. ceph_osd_event has been merged into
ceph_osd_linger_request.
All the details are now hidden within libceph; the interface consists
of a simple pair of watch/unwatch functions and ceph_osdc_notify_ack().
ceph_osdc_watch() does return ceph_osd_linger_request, but only to keep
the lifetime management simple.
ceph_osdc_notify_ack() accepts an optional data payload, which is
relayed back to the notifier.
Portions of this patch are loosely based on work by Douglas Fuller
<dfuller@redhat.com> and Mike Christie <michaelc@cs.wisc.edu>.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'include/linux/ceph')
-rw-r--r-- | include/linux/ceph/ceph_fs.h | 5 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 97 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 17 |
3 files changed, 75 insertions, 44 deletions
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 37f28bf55ce4..3b911ff889dd 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h | |||
@@ -153,8 +153,9 @@ struct ceph_dir_layout { | |||
153 | 153 | ||
154 | /* watch-notify operations */ | 154 | /* watch-notify operations */ |
155 | enum { | 155 | enum { |
156 | WATCH_NOTIFY = 1, /* notifying watcher */ | 156 | CEPH_WATCH_EVENT_NOTIFY = 1, /* notifying watcher */ |
157 | WATCH_NOTIFY_COMPLETE = 2, /* notifier notified when done */ | 157 | CEPH_WATCH_EVENT_NOTIFY_COMPLETE = 2, /* notifier notified when done */ |
158 | CEPH_WATCH_EVENT_DISCONNECT = 3, /* we were disconnected */ | ||
158 | }; | 159 | }; |
159 | 160 | ||
160 | 161 | ||
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 342f22f1f040..cd2dcb8939de 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -34,7 +34,7 @@ struct ceph_osd { | |||
34 | struct rb_node o_node; | 34 | struct rb_node o_node; |
35 | struct ceph_connection o_con; | 35 | struct ceph_connection o_con; |
36 | struct rb_root o_requests; | 36 | struct rb_root o_requests; |
37 | struct list_head o_linger_requests; | 37 | struct rb_root o_linger_requests; |
38 | struct list_head o_osd_lru; | 38 | struct list_head o_osd_lru; |
39 | struct ceph_auth_handshake o_auth; | 39 | struct ceph_auth_handshake o_auth; |
40 | unsigned long lru_ttl; | 40 | unsigned long lru_ttl; |
@@ -108,12 +108,13 @@ struct ceph_osd_req_op { | |||
108 | } cls; | 108 | } cls; |
109 | struct { | 109 | struct { |
110 | u64 cookie; | 110 | u64 cookie; |
111 | u64 ver; | 111 | __u8 op; /* CEPH_OSD_WATCH_OP_ */ |
112 | u32 prot_ver; | 112 | u32 gen; |
113 | u32 timeout; | ||
114 | __u8 flag; | ||
115 | } watch; | 113 | } watch; |
116 | struct { | 114 | struct { |
115 | struct ceph_osd_data request_data; | ||
116 | } notify_ack; | ||
117 | struct { | ||
117 | u64 expected_object_size; | 118 | u64 expected_object_size; |
118 | u64 expected_write_size; | 119 | u64 expected_write_size; |
119 | } alloc_hint; | 120 | } alloc_hint; |
@@ -145,8 +146,6 @@ struct ceph_osd_request_target { | |||
145 | struct ceph_osd_request { | 146 | struct ceph_osd_request { |
146 | u64 r_tid; /* unique for this client */ | 147 | u64 r_tid; /* unique for this client */ |
147 | struct rb_node r_node; | 148 | struct rb_node r_node; |
148 | struct list_head r_linger_item; | ||
149 | struct list_head r_linger_osd_item; | ||
150 | struct ceph_osd *r_osd; | 149 | struct ceph_osd *r_osd; |
151 | 150 | ||
152 | struct ceph_osd_request_target r_t; | 151 | struct ceph_osd_request_target r_t; |
@@ -162,7 +161,6 @@ struct ceph_osd_request { | |||
162 | 161 | ||
163 | int r_result; | 162 | int r_result; |
164 | bool r_got_reply; | 163 | bool r_got_reply; |
165 | int r_linger; | ||
166 | 164 | ||
167 | struct ceph_osd_client *r_osdc; | 165 | struct ceph_osd_client *r_osdc; |
168 | struct kref r_kref; | 166 | struct kref r_kref; |
@@ -181,6 +179,7 @@ struct ceph_osd_request { | |||
181 | struct ceph_snap_context *r_snapc; /* for writes */ | 179 | struct ceph_snap_context *r_snapc; /* for writes */ |
182 | struct timespec r_mtime; /* ditto */ | 180 | struct timespec r_mtime; /* ditto */ |
183 | u64 r_data_offset; /* ditto */ | 181 | u64 r_data_offset; /* ditto */ |
182 | bool r_linger; /* don't resend on failure */ | ||
184 | 183 | ||
185 | /* internal */ | 184 | /* internal */ |
186 | unsigned long r_stamp; /* jiffies, send or check time */ | 185 | unsigned long r_stamp; /* jiffies, send or check time */ |
@@ -195,23 +194,40 @@ struct ceph_request_redirect { | |||
195 | struct ceph_object_locator oloc; | 194 | struct ceph_object_locator oloc; |
196 | }; | 195 | }; |
197 | 196 | ||
198 | struct ceph_osd_event { | 197 | typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, |
199 | u64 cookie; | 198 | u64 notifier_id, void *data, size_t data_len); |
200 | int one_shot; | 199 | typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); |
200 | |||
201 | struct ceph_osd_linger_request { | ||
201 | struct ceph_osd_client *osdc; | 202 | struct ceph_osd_client *osdc; |
202 | void (*cb)(u64, u64, u8, void *); | 203 | u64 linger_id; |
203 | void *data; | 204 | bool committed; |
204 | struct rb_node node; | 205 | |
205 | struct list_head osd_node; | 206 | struct ceph_osd *osd; |
207 | struct ceph_osd_request *reg_req; | ||
208 | struct ceph_osd_request *ping_req; | ||
209 | unsigned long ping_sent; | ||
210 | |||
211 | struct ceph_osd_request_target t; | ||
212 | u32 last_force_resend; | ||
213 | |||
214 | struct timespec mtime; | ||
215 | |||
206 | struct kref kref; | 216 | struct kref kref; |
207 | }; | 217 | struct mutex lock; |
218 | struct rb_node node; /* osd */ | ||
219 | struct rb_node osdc_node; /* osdc */ | ||
220 | struct list_head scan_item; | ||
221 | |||
222 | struct completion reg_commit_wait; | ||
223 | int reg_commit_error; | ||
224 | int last_error; | ||
225 | |||
226 | u32 register_gen; | ||
208 | 227 | ||
209 | struct ceph_osd_event_work { | 228 | rados_watchcb2_t wcb; |
210 | struct work_struct work; | 229 | rados_watcherrcb_t errcb; |
211 | struct ceph_osd_event *event; | 230 | void *data; |
212 | u64 ver; | ||
213 | u64 notify_id; | ||
214 | u8 opcode; | ||
215 | }; | 231 | }; |
216 | 232 | ||
217 | struct ceph_osd_client { | 233 | struct ceph_osd_client { |
@@ -223,9 +239,10 @@ struct ceph_osd_client { | |||
223 | struct rb_root osds; /* osds */ | 239 | struct rb_root osds; /* osds */ |
224 | struct list_head osd_lru; /* idle osds */ | 240 | struct list_head osd_lru; /* idle osds */ |
225 | spinlock_t osd_lru_lock; | 241 | spinlock_t osd_lru_lock; |
226 | struct list_head req_linger; /* lingering requests */ | ||
227 | struct ceph_osd homeless_osd; | 242 | struct ceph_osd homeless_osd; |
228 | atomic64_t last_tid; /* tid of last request */ | 243 | atomic64_t last_tid; /* tid of last request */ |
244 | u64 last_linger_id; | ||
245 | struct rb_root linger_requests; /* lingering requests */ | ||
229 | atomic_t num_requests; | 246 | atomic_t num_requests; |
230 | atomic_t num_homeless; | 247 | atomic_t num_homeless; |
231 | struct delayed_work timeout_work; | 248 | struct delayed_work timeout_work; |
@@ -239,10 +256,6 @@ struct ceph_osd_client { | |||
239 | struct ceph_msgpool msgpool_op; | 256 | struct ceph_msgpool msgpool_op; |
240 | struct ceph_msgpool msgpool_op_reply; | 257 | struct ceph_msgpool msgpool_op_reply; |
241 | 258 | ||
242 | spinlock_t event_lock; | ||
243 | struct rb_root event_tree; | ||
244 | u64 event_count; | ||
245 | |||
246 | struct workqueue_struct *notify_wq; | 259 | struct workqueue_struct *notify_wq; |
247 | }; | 260 | }; |
248 | 261 | ||
@@ -314,9 +327,6 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, | |||
314 | extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | 327 | extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, |
315 | u16 opcode, const char *name, const void *value, | 328 | u16 opcode, const char *name, const void *value, |
316 | size_t size, u8 cmp_op, u8 cmp_mode); | 329 | size_t size, u8 cmp_op, u8 cmp_mode); |
317 | extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, | ||
318 | unsigned int which, u16 opcode, | ||
319 | u64 cookie, u64 version, int flag); | ||
320 | extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, | 330 | extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, |
321 | unsigned int which, | 331 | unsigned int which, |
322 | u64 expected_object_size, | 332 | u64 expected_object_size, |
@@ -339,9 +349,6 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, | |||
339 | u32 truncate_seq, u64 truncate_size, | 349 | u32 truncate_seq, u64 truncate_size, |
340 | bool use_mempool); | 350 | bool use_mempool); |
341 | 351 | ||
342 | extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | ||
343 | struct ceph_osd_request *req); | ||
344 | |||
345 | extern void ceph_osdc_get_request(struct ceph_osd_request *req); | 352 | extern void ceph_osdc_get_request(struct ceph_osd_request *req); |
346 | extern void ceph_osdc_put_request(struct ceph_osd_request *req); | 353 | extern void ceph_osdc_put_request(struct ceph_osd_request *req); |
347 | 354 | ||
@@ -372,11 +379,23 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | |||
372 | struct timespec *mtime, | 379 | struct timespec *mtime, |
373 | struct page **pages, int nr_pages); | 380 | struct page **pages, int nr_pages); |
374 | 381 | ||
375 | /* watch/notify events */ | 382 | /* watch/notify */ |
376 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 383 | struct ceph_osd_linger_request * |
377 | void (*event_cb)(u64, u64, u8, void *), | 384 | ceph_osdc_watch(struct ceph_osd_client *osdc, |
378 | void *data, struct ceph_osd_event **pevent); | 385 | struct ceph_object_id *oid, |
379 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); | 386 | struct ceph_object_locator *oloc, |
380 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); | 387 | rados_watchcb2_t wcb, |
388 | rados_watcherrcb_t errcb, | ||
389 | void *data); | ||
390 | int ceph_osdc_unwatch(struct ceph_osd_client *osdc, | ||
391 | struct ceph_osd_linger_request *lreq); | ||
392 | |||
393 | int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, | ||
394 | struct ceph_object_id *oid, | ||
395 | struct ceph_object_locator *oloc, | ||
396 | u64 notify_id, | ||
397 | u64 cookie, | ||
398 | void *payload, | ||
399 | size_t payload_len); | ||
381 | #endif | 400 | #endif |
382 | 401 | ||
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 28740a58f32c..204c8c944703 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
@@ -427,7 +427,17 @@ enum { | |||
427 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 | 427 | CEPH_OSD_CMPXATTR_MODE_U64 = 2 |
428 | }; | 428 | }; |
429 | 429 | ||
430 | #define RADOS_NOTIFY_VER 1 | 430 | enum { |
431 | CEPH_OSD_WATCH_OP_UNWATCH = 0, | ||
432 | CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, | ||
433 | /* note: use only ODD ids to prevent pre-giant code from | ||
434 | interpreting the op as UNWATCH */ | ||
435 | CEPH_OSD_WATCH_OP_WATCH = 3, | ||
436 | CEPH_OSD_WATCH_OP_RECONNECT = 5, | ||
437 | CEPH_OSD_WATCH_OP_PING = 7, | ||
438 | }; | ||
439 | |||
440 | const char *ceph_osd_watch_op_name(int o); | ||
431 | 441 | ||
432 | /* | 442 | /* |
433 | * an individual object operation. each may be accompanied by some data | 443 | * an individual object operation. each may be accompanied by some data |
@@ -462,8 +472,9 @@ struct ceph_osd_op { | |||
462 | } __attribute__ ((packed)) snap; | 472 | } __attribute__ ((packed)) snap; |
463 | struct { | 473 | struct { |
464 | __le64 cookie; | 474 | __le64 cookie; |
465 | __le64 ver; | 475 | __le64 ver; /* no longer used */ |
466 | __u8 flag; /* 0 = unwatch, 1 = watch */ | 476 | __u8 op; /* CEPH_OSD_WATCH_OP_* */ |
477 | __le32 gen; /* registration generation */ | ||
467 | } __attribute__ ((packed)) watch; | 478 | } __attribute__ ((packed)) watch; |
468 | struct { | 479 | struct { |
469 | __le64 offset, length; | 480 | __le64 offset, length; |