diff options
-rw-r--r-- | drivers/block/rbd.c | 52 | ||||
-rw-r--r-- | fs/ceph/addr.c | 31 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 19 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 38 | ||||
-rw-r--r-- | net/ceph/debugfs.c | 18 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 233 |
6 files changed, 222 insertions, 169 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 22085e86a409..6c81a4c040b9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -196,7 +196,7 @@ struct rbd_obj_request { | |||
196 | 196 | ||
197 | u64 xferred; /* bytes transferred */ | 197 | u64 xferred; /* bytes transferred */ |
198 | u64 version; | 198 | u64 version; |
199 | s32 result; | 199 | int result; |
200 | atomic_t done; | 200 | atomic_t done; |
201 | 201 | ||
202 | rbd_obj_callback_t callback; | 202 | rbd_obj_callback_t callback; |
@@ -1282,12 +1282,19 @@ static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) | |||
1282 | 1282 | ||
1283 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) | 1283 | static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) |
1284 | { | 1284 | { |
1285 | |||
1286 | dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, | 1285 | dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, |
1287 | obj_request->result, obj_request->xferred, obj_request->length); | 1286 | obj_request->result, obj_request->xferred, obj_request->length); |
1288 | if (obj_request->result == (s32) -ENOENT) { | 1287 | /* |
1288 | * ENOENT means a hole in the object. We zero-fill the | ||
1289 | * entire length of the request. A short read also implies | ||
1290 | * zero-fill to the end of the request. Either way we | ||
1291 | * update the xferred count to indicate the whole request | ||
1292 | * was satisfied. | ||
1293 | */ | ||
1294 | if (obj_request->result == -ENOENT) { | ||
1289 | zero_bio_chain(obj_request->bio_list, 0); | 1295 | zero_bio_chain(obj_request->bio_list, 0); |
1290 | obj_request->result = 0; | 1296 | obj_request->result = 0; |
1297 | obj_request->xferred = obj_request->length; | ||
1291 | } else if (obj_request->xferred < obj_request->length && | 1298 | } else if (obj_request->xferred < obj_request->length && |
1292 | !obj_request->result) { | 1299 | !obj_request->result) { |
1293 | zero_bio_chain(obj_request->bio_list, obj_request->xferred); | 1300 | zero_bio_chain(obj_request->bio_list, obj_request->xferred); |
@@ -1298,20 +1305,14 @@ static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) | |||
1298 | 1305 | ||
1299 | static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) | 1306 | static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) |
1300 | { | 1307 | { |
1301 | dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request, | 1308 | dout("%s: obj %p result %d %llu\n", __func__, obj_request, |
1302 | obj_request->result, obj_request->xferred, obj_request->length); | 1309 | obj_request->result, obj_request->length); |
1303 | 1310 | /* | |
1304 | /* A short write really shouldn't occur. Warn if we see one */ | 1311 | * There is no such thing as a successful short write. |
1305 | 1312 | * Our xferred value is the number of bytes transferred | |
1306 | if (obj_request->xferred != obj_request->length) { | 1313 | * back. Set it to our originally-requested length. |
1307 | struct rbd_img_request *img_request = obj_request->img_request; | 1314 | */ |
1308 | struct rbd_device *rbd_dev; | 1315 | obj_request->xferred = obj_request->length; |
1309 | |||
1310 | rbd_dev = img_request ? img_request->rbd_dev : NULL; | ||
1311 | rbd_warn(rbd_dev, "wrote %llu want %llu\n", | ||
1312 | obj_request->xferred, obj_request->length); | ||
1313 | } | ||
1314 | |||
1315 | obj_request_done_set(obj_request); | 1316 | obj_request_done_set(obj_request); |
1316 | } | 1317 | } |
1317 | 1318 | ||
@@ -1329,9 +1330,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, | |||
1329 | struct ceph_msg *msg) | 1330 | struct ceph_msg *msg) |
1330 | { | 1331 | { |
1331 | struct rbd_obj_request *obj_request = osd_req->r_priv; | 1332 | struct rbd_obj_request *obj_request = osd_req->r_priv; |
1332 | struct ceph_osd_reply_head *reply_head; | ||
1333 | struct ceph_osd_op *op; | ||
1334 | u32 num_ops; | ||
1335 | u16 opcode; | 1333 | u16 opcode; |
1336 | 1334 | ||
1337 | dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); | 1335 | dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); |
@@ -1339,22 +1337,19 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, | |||
1339 | rbd_assert(!!obj_request->img_request ^ | 1337 | rbd_assert(!!obj_request->img_request ^ |
1340 | (obj_request->which == BAD_WHICH)); | 1338 | (obj_request->which == BAD_WHICH)); |
1341 | 1339 | ||
1342 | reply_head = msg->front.iov_base; | 1340 | if (osd_req->r_result < 0) |
1343 | obj_request->result = (s32) le32_to_cpu(reply_head->result); | 1341 | obj_request->result = osd_req->r_result; |
1344 | obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); | 1342 | obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version); |
1345 | 1343 | ||
1346 | num_ops = le32_to_cpu(reply_head->num_ops); | 1344 | WARN_ON(osd_req->r_num_ops != 1); /* For now */ |
1347 | WARN_ON(num_ops != 1); /* For now */ | ||
1348 | 1345 | ||
1349 | /* | 1346 | /* |
1350 | * We support a 64-bit length, but ultimately it has to be | 1347 | * We support a 64-bit length, but ultimately it has to be |
1351 | * passed to blk_end_request(), which takes an unsigned int. | 1348 | * passed to blk_end_request(), which takes an unsigned int. |
1352 | */ | 1349 | */ |
1353 | op = &reply_head->ops[0]; | 1350 | obj_request->xferred = osd_req->r_reply_op_len[0]; |
1354 | obj_request->xferred = le64_to_cpu(op->extent.length); | ||
1355 | rbd_assert(obj_request->xferred < (u64) UINT_MAX); | 1351 | rbd_assert(obj_request->xferred < (u64) UINT_MAX); |
1356 | 1352 | opcode = osd_req->r_request_ops[0].op; | |
1357 | opcode = le16_to_cpu(op->op); | ||
1358 | switch (opcode) { | 1353 | switch (opcode) { |
1359 | case CEPH_OSD_OP_READ: | 1354 | case CEPH_OSD_OP_READ: |
1360 | rbd_osd_read_callback(obj_request); | 1355 | rbd_osd_read_callback(obj_request); |
@@ -1719,6 +1714,7 @@ static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) | |||
1719 | more = blk_end_request(img_request->rq, result, xferred); | 1714 | more = blk_end_request(img_request->rq, result, xferred); |
1720 | which++; | 1715 | which++; |
1721 | } | 1716 | } |
1717 | |||
1722 | rbd_assert(more ^ (which == img_request->obj_request_count)); | 1718 | rbd_assert(more ^ (which == img_request->obj_request_count)); |
1723 | img_request->next_completion = which; | 1719 | img_request->next_completion = which; |
1724 | out: | 1720 | out: |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index fc613715af46..cfef3e01a9b3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page) | |||
236 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) | 236 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) |
237 | { | 237 | { |
238 | struct inode *inode = req->r_inode; | 238 | struct inode *inode = req->r_inode; |
239 | struct ceph_osd_reply_head *replyhead; | 239 | int rc = req->r_result; |
240 | int rc, bytes; | 240 | int bytes = le32_to_cpu(msg->hdr.data_len); |
241 | int i; | 241 | int i; |
242 | 242 | ||
243 | /* parse reply */ | ||
244 | replyhead = msg->front.iov_base; | ||
245 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | ||
246 | rc = le32_to_cpu(replyhead->result); | ||
247 | bytes = le32_to_cpu(msg->hdr.data_len); | ||
248 | |||
249 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); | 243 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); |
250 | 244 | ||
251 | /* unlock all pages, zeroing any data we didn't read */ | 245 | /* unlock all pages, zeroing any data we didn't read */ |
@@ -553,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req, | |||
553 | struct ceph_msg *msg) | 547 | struct ceph_msg *msg) |
554 | { | 548 | { |
555 | struct inode *inode = req->r_inode; | 549 | struct inode *inode = req->r_inode; |
556 | struct ceph_osd_reply_head *replyhead; | ||
557 | struct ceph_osd_op *op; | ||
558 | struct ceph_inode_info *ci = ceph_inode(inode); | 550 | struct ceph_inode_info *ci = ceph_inode(inode); |
559 | unsigned wrote; | 551 | unsigned wrote; |
560 | struct page *page; | 552 | struct page *page; |
561 | int i; | 553 | int i; |
562 | struct ceph_snap_context *snapc = req->r_snapc; | 554 | struct ceph_snap_context *snapc = req->r_snapc; |
563 | struct address_space *mapping = inode->i_mapping; | 555 | struct address_space *mapping = inode->i_mapping; |
564 | __s32 rc = -EIO; | 556 | int rc = req->r_result; |
565 | u64 bytes = 0; | 557 | u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length); |
566 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 558 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
567 | long writeback_stat; | 559 | long writeback_stat; |
568 | unsigned issued = ceph_caps_issued(ci); | 560 | unsigned issued = ceph_caps_issued(ci); |
569 | 561 | ||
570 | /* parse reply */ | ||
571 | replyhead = msg->front.iov_base; | ||
572 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | ||
573 | op = (void *)(replyhead + 1); | ||
574 | rc = le32_to_cpu(replyhead->result); | ||
575 | bytes = le64_to_cpu(op->extent.length); | ||
576 | |||
577 | if (rc >= 0) { | 562 | if (rc >= 0) { |
578 | /* | 563 | /* |
579 | * Assume we wrote the pages we originally sent. The | 564 | * Assume we wrote the pages we originally sent. The |
@@ -740,8 +725,6 @@ retry: | |||
740 | struct page *page; | 725 | struct page *page; |
741 | int want; | 726 | int want; |
742 | u64 offset, len; | 727 | u64 offset, len; |
743 | struct ceph_osd_request_head *reqhead; | ||
744 | struct ceph_osd_op *op; | ||
745 | long writeback_stat; | 728 | long writeback_stat; |
746 | 729 | ||
747 | next = 0; | 730 | next = 0; |
@@ -905,10 +888,8 @@ get_more_pages: | |||
905 | 888 | ||
906 | /* revise final length, page count */ | 889 | /* revise final length, page count */ |
907 | req->r_num_pages = locked_pages; | 890 | req->r_num_pages = locked_pages; |
908 | reqhead = req->r_request->front.iov_base; | 891 | req->r_request_ops[0].extent.length = cpu_to_le64(len); |
909 | op = (void *)(reqhead + 1); | 892 | req->r_request_ops[0].payload_len = cpu_to_le32(len); |
910 | op->extent.length = cpu_to_le64(len); | ||
911 | op->payload_len = cpu_to_le32(len); | ||
912 | req->r_request->hdr.data_len = cpu_to_le32(len); | 893 | req->r_request->hdr.data_len = cpu_to_le32(len); |
913 | 894 | ||
914 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); | 895 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); |
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad8899fc3157..1dd5d466b6f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -47,6 +47,9 @@ struct ceph_osd { | |||
47 | struct list_head o_keepalive_item; | 47 | struct list_head o_keepalive_item; |
48 | }; | 48 | }; |
49 | 49 | ||
50 | |||
51 | #define CEPH_OSD_MAX_OP 10 | ||
52 | |||
50 | /* an in-flight request */ | 53 | /* an in-flight request */ |
51 | struct ceph_osd_request { | 54 | struct ceph_osd_request { |
52 | u64 r_tid; /* unique for this client */ | 55 | u64 r_tid; /* unique for this client */ |
@@ -63,9 +66,23 @@ struct ceph_osd_request { | |||
63 | struct ceph_connection *r_con_filling_msg; | 66 | struct ceph_connection *r_con_filling_msg; |
64 | 67 | ||
65 | struct ceph_msg *r_request, *r_reply; | 68 | struct ceph_msg *r_request, *r_reply; |
66 | int r_result; | ||
67 | int r_flags; /* any additional flags for the osd */ | 69 | int r_flags; /* any additional flags for the osd */ |
68 | u32 r_sent; /* >0 if r_request is sending/sent */ | 70 | u32 r_sent; /* >0 if r_request is sending/sent */ |
71 | int r_num_ops; | ||
72 | |||
73 | /* encoded message content */ | ||
74 | struct ceph_osd_op *r_request_ops; | ||
75 | /* these are updated on each send */ | ||
76 | __le32 *r_request_osdmap_epoch; | ||
77 | __le32 *r_request_flags; | ||
78 | __le64 *r_request_pool; | ||
79 | void *r_request_pgid; | ||
80 | __le32 *r_request_attempts; | ||
81 | struct ceph_eversion *r_request_reassert_version; | ||
82 | |||
83 | int r_result; | ||
84 | int r_reply_op_len[CEPH_OSD_MAX_OP]; | ||
85 | s32 r_reply_op_result[CEPH_OSD_MAX_OP]; | ||
69 | int r_got_reply; | 86 | int r_got_reply; |
70 | int r_linger; | 87 | int r_linger; |
71 | 88 | ||
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index d784c8dfb09a..68c96a508ac2 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
@@ -416,43 +416,5 @@ struct ceph_osd_op { | |||
416 | __le32 payload_len; | 416 | __le32 payload_len; |
417 | } __attribute__ ((packed)); | 417 | } __attribute__ ((packed)); |
418 | 418 | ||
419 | /* | ||
420 | * osd request message header. each request may include multiple | ||
421 | * ceph_osd_op object operations. | ||
422 | */ | ||
423 | struct ceph_osd_request_head { | ||
424 | __le32 client_inc; /* client incarnation */ | ||
425 | struct ceph_object_layout layout; /* pgid */ | ||
426 | __le32 osdmap_epoch; /* client's osdmap epoch */ | ||
427 | |||
428 | __le32 flags; | ||
429 | |||
430 | struct ceph_timespec mtime; /* for mutations only */ | ||
431 | struct ceph_eversion reassert_version; /* if we are replaying op */ | ||
432 | |||
433 | __le32 object_len; /* length of object name */ | ||
434 | |||
435 | __le64 snapid; /* snapid to read */ | ||
436 | __le64 snap_seq; /* writer's snap context */ | ||
437 | __le32 num_snaps; | ||
438 | |||
439 | __le16 num_ops; | ||
440 | struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */ | ||
441 | } __attribute__ ((packed)); | ||
442 | |||
443 | struct ceph_osd_reply_head { | ||
444 | __le32 client_inc; /* client incarnation */ | ||
445 | __le32 flags; | ||
446 | struct ceph_object_layout layout; | ||
447 | __le32 osdmap_epoch; | ||
448 | struct ceph_eversion reassert_version; /* for replaying uncommitted */ | ||
449 | |||
450 | __le32 result; /* result code */ | ||
451 | |||
452 | __le32 object_len; /* length of object name */ | ||
453 | __le32 num_ops; | ||
454 | struct ceph_osd_op ops[0]; /* ops[], object */ | ||
455 | } __attribute__ ((packed)); | ||
456 | |||
457 | 419 | ||
458 | #endif | 420 | #endif |
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index f4d4b27d6026..00d051f4894e 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c | |||
@@ -123,10 +123,7 @@ static int osdc_show(struct seq_file *s, void *pp) | |||
123 | mutex_lock(&osdc->request_mutex); | 123 | mutex_lock(&osdc->request_mutex); |
124 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { | 124 | for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { |
125 | struct ceph_osd_request *req; | 125 | struct ceph_osd_request *req; |
126 | struct ceph_osd_request_head *head; | 126 | int opcode; |
127 | struct ceph_osd_op *op; | ||
128 | int num_ops; | ||
129 | int opcode, olen; | ||
130 | int i; | 127 | int i; |
131 | 128 | ||
132 | req = rb_entry(p, struct ceph_osd_request, r_node); | 129 | req = rb_entry(p, struct ceph_osd_request, r_node); |
@@ -135,13 +132,7 @@ static int osdc_show(struct seq_file *s, void *pp) | |||
135 | req->r_osd ? req->r_osd->o_osd : -1, | 132 | req->r_osd ? req->r_osd->o_osd : -1, |
136 | req->r_pgid.pool, req->r_pgid.seed); | 133 | req->r_pgid.pool, req->r_pgid.seed); |
137 | 134 | ||
138 | head = req->r_request->front.iov_base; | 135 | seq_printf(s, "%.*s", req->r_oid_len, req->r_oid); |
139 | op = (void *)(head + 1); | ||
140 | |||
141 | num_ops = le16_to_cpu(head->num_ops); | ||
142 | olen = le32_to_cpu(head->object_len); | ||
143 | seq_printf(s, "%.*s", olen, | ||
144 | (const char *)(head->ops + num_ops)); | ||
145 | 136 | ||
146 | if (req->r_reassert_version.epoch) | 137 | if (req->r_reassert_version.epoch) |
147 | seq_printf(s, "\t%u'%llu", | 138 | seq_printf(s, "\t%u'%llu", |
@@ -150,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp) | |||
150 | else | 141 | else |
151 | seq_printf(s, "\t"); | 142 | seq_printf(s, "\t"); |
152 | 143 | ||
153 | for (i = 0; i < num_ops; i++) { | 144 | for (i = 0; i < req->r_num_ops; i++) { |
154 | opcode = le16_to_cpu(op->op); | 145 | opcode = le16_to_cpu(req->r_request_ops[i].op); |
155 | seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); | 146 | seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); |
156 | op++; | ||
157 | } | 147 | } |
158 | 148 | ||
159 | seq_printf(s, "\n"); | 149 | seq_printf(s, "\n"); |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 5584f0a08e28..d730dd4d8eb2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -146,15 +146,23 @@ EXPORT_SYMBOL(ceph_osdc_release_request); | |||
146 | 146 | ||
147 | struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | 147 | struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, |
148 | struct ceph_snap_context *snapc, | 148 | struct ceph_snap_context *snapc, |
149 | unsigned int num_op, | 149 | unsigned int num_ops, |
150 | bool use_mempool, | 150 | bool use_mempool, |
151 | gfp_t gfp_flags) | 151 | gfp_t gfp_flags) |
152 | { | 152 | { |
153 | struct ceph_osd_request *req; | 153 | struct ceph_osd_request *req; |
154 | struct ceph_msg *msg; | 154 | struct ceph_msg *msg; |
155 | size_t msg_size = sizeof(struct ceph_osd_request_head); | 155 | size_t msg_size; |
156 | 156 | ||
157 | msg_size += num_op*sizeof(struct ceph_osd_op); | 157 | msg_size = 4 + 4 + 8 + 8 + 4+8; |
158 | msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ | ||
159 | msg_size += 1 + 8 + 4 + 4; /* pg_t */ | ||
160 | msg_size += 4 + MAX_OBJ_NAME_SIZE; | ||
161 | msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); | ||
162 | msg_size += 8; /* snapid */ | ||
163 | msg_size += 8; /* snap_seq */ | ||
164 | msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ | ||
165 | msg_size += 4; | ||
158 | 166 | ||
159 | if (use_mempool) { | 167 | if (use_mempool) { |
160 | req = mempool_alloc(osdc->req_mempool, gfp_flags); | 168 | req = mempool_alloc(osdc->req_mempool, gfp_flags); |
@@ -193,9 +201,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, | |||
193 | ceph_pagelist_init(&req->r_trail); | 201 | ceph_pagelist_init(&req->r_trail); |
194 | 202 | ||
195 | /* create request message; allow space for oid */ | 203 | /* create request message; allow space for oid */ |
196 | msg_size += MAX_OBJ_NAME_SIZE; | ||
197 | if (snapc) | ||
198 | msg_size += sizeof(u64) * snapc->num_snaps; | ||
199 | if (use_mempool) | 204 | if (use_mempool) |
200 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); | 205 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); |
201 | else | 206 | else |
@@ -324,55 +329,80 @@ static void osd_req_encode_op(struct ceph_osd_request *req, | |||
324 | * | 329 | * |
325 | */ | 330 | */ |
326 | void ceph_osdc_build_request(struct ceph_osd_request *req, | 331 | void ceph_osdc_build_request(struct ceph_osd_request *req, |
327 | u64 off, u64 len, unsigned int num_op, | 332 | u64 off, u64 len, unsigned int num_ops, |
328 | struct ceph_osd_req_op *src_ops, | 333 | struct ceph_osd_req_op *src_ops, |
329 | struct ceph_snap_context *snapc, u64 snap_id, | 334 | struct ceph_snap_context *snapc, u64 snap_id, |
330 | struct timespec *mtime) | 335 | struct timespec *mtime) |
331 | { | 336 | { |
332 | struct ceph_msg *msg = req->r_request; | 337 | struct ceph_msg *msg = req->r_request; |
333 | struct ceph_osd_request_head *head; | ||
334 | struct ceph_osd_req_op *src_op; | 338 | struct ceph_osd_req_op *src_op; |
335 | struct ceph_osd_op *op; | ||
336 | void *p; | 339 | void *p; |
337 | size_t msg_size = sizeof(*head) + num_op*sizeof(*op); | 340 | size_t msg_size; |
338 | int flags = req->r_flags; | 341 | int flags = req->r_flags; |
339 | u64 data_len; | 342 | u64 data_len; |
340 | int i; | 343 | int i; |
341 | 344 | ||
342 | WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); | 345 | req->r_num_ops = num_ops; |
343 | 346 | req->r_snapid = snap_id; | |
344 | head = msg->front.iov_base; | ||
345 | head->snapid = cpu_to_le64(snap_id); | ||
346 | op = (void *)(head + 1); | ||
347 | p = (void *)(op + num_op); | ||
348 | |||
349 | req->r_snapc = ceph_get_snap_context(snapc); | 347 | req->r_snapc = ceph_get_snap_context(snapc); |
350 | 348 | ||
351 | head->client_inc = cpu_to_le32(1); /* always, for now. */ | 349 | /* encode request */ |
352 | head->flags = cpu_to_le32(flags); | 350 | msg->hdr.version = cpu_to_le16(4); |
353 | if (flags & CEPH_OSD_FLAG_WRITE) | ||
354 | ceph_encode_timespec(&head->mtime, mtime); | ||
355 | BUG_ON(num_op > (unsigned int) ((u16) -1)); | ||
356 | head->num_ops = cpu_to_le16(num_op); | ||
357 | 351 | ||
358 | /* fill in oid */ | 352 | p = msg->front.iov_base; |
359 | head->object_len = cpu_to_le32(req->r_oid_len); | 353 | ceph_encode_32(&p, 1); /* client_inc is always 1 */ |
354 | req->r_request_osdmap_epoch = p; | ||
355 | p += 4; | ||
356 | req->r_request_flags = p; | ||
357 | p += 4; | ||
358 | if (req->r_flags & CEPH_OSD_FLAG_WRITE) | ||
359 | ceph_encode_timespec(p, mtime); | ||
360 | p += sizeof(struct ceph_timespec); | ||
361 | req->r_request_reassert_version = p; | ||
362 | p += sizeof(struct ceph_eversion); /* will get filled in */ | ||
363 | |||
364 | /* oloc */ | ||
365 | ceph_encode_8(&p, 4); | ||
366 | ceph_encode_8(&p, 4); | ||
367 | ceph_encode_32(&p, 8 + 4 + 4); | ||
368 | req->r_request_pool = p; | ||
369 | p += 8; | ||
370 | ceph_encode_32(&p, -1); /* preferred */ | ||
371 | ceph_encode_32(&p, 0); /* key len */ | ||
372 | |||
373 | ceph_encode_8(&p, 1); | ||
374 | req->r_request_pgid = p; | ||
375 | p += 8 + 4; | ||
376 | ceph_encode_32(&p, -1); /* preferred */ | ||
377 | |||
378 | /* oid */ | ||
379 | ceph_encode_32(&p, req->r_oid_len); | ||
360 | memcpy(p, req->r_oid, req->r_oid_len); | 380 | memcpy(p, req->r_oid, req->r_oid_len); |
381 | dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); | ||
361 | p += req->r_oid_len; | 382 | p += req->r_oid_len; |
362 | 383 | ||
384 | /* ops */ | ||
385 | ceph_encode_16(&p, num_ops); | ||
363 | src_op = src_ops; | 386 | src_op = src_ops; |
364 | while (num_op--) | 387 | req->r_request_ops = p; |
365 | osd_req_encode_op(req, op++, src_op++); | 388 | for (i = 0; i < num_ops; i++, src_op++) { |
389 | osd_req_encode_op(req, p, src_op); | ||
390 | p += sizeof(struct ceph_osd_op); | ||
391 | } | ||
366 | 392 | ||
367 | if (snapc) { | 393 | /* snaps */ |
368 | head->snap_seq = cpu_to_le64(snapc->seq); | 394 | ceph_encode_64(&p, req->r_snapid); |
369 | head->num_snaps = cpu_to_le32(snapc->num_snaps); | 395 | ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); |
396 | ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); | ||
397 | if (req->r_snapc) { | ||
370 | for (i = 0; i < snapc->num_snaps; i++) { | 398 | for (i = 0; i < snapc->num_snaps; i++) { |
371 | put_unaligned_le64(snapc->snaps[i], p); | 399 | ceph_encode_64(&p, req->r_snapc->snaps[i]); |
372 | p += sizeof(u64); | ||
373 | } | 400 | } |
374 | } | 401 | } |
375 | 402 | ||
403 | req->r_request_attempts = p; | ||
404 | p += 4; | ||
405 | |||
376 | data_len = req->r_trail.length; | 406 | data_len = req->r_trail.length; |
377 | if (flags & CEPH_OSD_FLAG_WRITE) { | 407 | if (flags & CEPH_OSD_FLAG_WRITE) { |
378 | req->r_request->hdr.data_off = cpu_to_le16(off); | 408 | req->r_request->hdr.data_off = cpu_to_le16(off); |
@@ -385,6 +415,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, | |||
385 | msg_size = p - msg->front.iov_base; | 415 | msg_size = p - msg->front.iov_base; |
386 | msg->front.iov_len = msg_size; | 416 | msg->front.iov_len = msg_size; |
387 | msg->hdr.front_len = cpu_to_le32(msg_size); | 417 | msg->hdr.front_len = cpu_to_le32(msg_size); |
418 | |||
419 | dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, | ||
420 | num_ops); | ||
388 | return; | 421 | return; |
389 | } | 422 | } |
390 | EXPORT_SYMBOL(ceph_osdc_build_request); | 423 | EXPORT_SYMBOL(ceph_osdc_build_request); |
@@ -991,21 +1024,22 @@ out: | |||
991 | static void __send_request(struct ceph_osd_client *osdc, | 1024 | static void __send_request(struct ceph_osd_client *osdc, |
992 | struct ceph_osd_request *req) | 1025 | struct ceph_osd_request *req) |
993 | { | 1026 | { |
994 | struct ceph_osd_request_head *reqhead; | 1027 | void *p; |
995 | |||
996 | dout("send_request %p tid %llu to osd%d flags %d\n", | ||
997 | req, req->r_tid, req->r_osd->o_osd, req->r_flags); | ||
998 | |||
999 | reqhead = req->r_request->front.iov_base; | ||
1000 | reqhead->snapid = cpu_to_le64(req->r_snapid); | ||
1001 | reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); | ||
1002 | reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ | ||
1003 | reqhead->reassert_version = req->r_reassert_version; | ||
1004 | 1028 | ||
1005 | reqhead->layout.ol_pgid.ps = cpu_to_le16(req->r_pgid.seed); | 1029 | dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", |
1006 | reqhead->layout.ol_pgid.pool = cpu_to_le32(req->r_pgid.pool); | 1030 | req, req->r_tid, req->r_osd->o_osd, req->r_flags, |
1007 | reqhead->layout.ol_pgid.preferred = cpu_to_le16(-1); | 1031 | (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); |
1008 | reqhead->layout.ol_stripe_unit = 0; | 1032 | |
1033 | /* fill in message content that changes each time we send it */ | ||
1034 | put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); | ||
1035 | put_unaligned_le32(req->r_flags, req->r_request_flags); | ||
1036 | put_unaligned_le64(req->r_pgid.pool, req->r_request_pool); | ||
1037 | p = req->r_request_pgid; | ||
1038 | ceph_encode_64(&p, req->r_pgid.pool); | ||
1039 | ceph_encode_32(&p, req->r_pgid.seed); | ||
1040 | put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ | ||
1041 | memcpy(req->r_request_reassert_version, &req->r_reassert_version, | ||
1042 | sizeof(req->r_reassert_version)); | ||
1009 | 1043 | ||
1010 | req->r_stamp = jiffies; | 1044 | req->r_stamp = jiffies; |
1011 | list_move_tail(&req->r_req_lru_item, &osdc->req_lru); | 1045 | list_move_tail(&req->r_req_lru_item, &osdc->req_lru); |
@@ -1105,6 +1139,26 @@ static void complete_request(struct ceph_osd_request *req) | |||
1105 | complete_all(&req->r_safe_completion); /* fsync waiter */ | 1139 | complete_all(&req->r_safe_completion); /* fsync waiter */ |
1106 | } | 1140 | } |
1107 | 1141 | ||
1142 | static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid) | ||
1143 | { | ||
1144 | __u8 v; | ||
1145 | |||
1146 | ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad); | ||
1147 | v = ceph_decode_8(p); | ||
1148 | if (v > 1) { | ||
1149 | pr_warning("do not understand pg encoding %d > 1", v); | ||
1150 | return -EINVAL; | ||
1151 | } | ||
1152 | pgid->pool = ceph_decode_64(p); | ||
1153 | pgid->seed = ceph_decode_32(p); | ||
1154 | *p += 4; | ||
1155 | return 0; | ||
1156 | |||
1157 | bad: | ||
1158 | pr_warning("incomplete pg encoding"); | ||
1159 | return -EINVAL; | ||
1160 | } | ||
1161 | |||
1108 | /* | 1162 | /* |
1109 | * handle osd op reply. either call the callback if it is specified, | 1163 | * handle osd op reply. either call the callback if it is specified, |
1110 | * or do the completion to wake up the waiting thread. | 1164 | * or do the completion to wake up the waiting thread. |
@@ -1112,22 +1166,42 @@ static void complete_request(struct ceph_osd_request *req) | |||
1112 | static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | 1166 | static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, |
1113 | struct ceph_connection *con) | 1167 | struct ceph_connection *con) |
1114 | { | 1168 | { |
1115 | struct ceph_osd_reply_head *rhead = msg->front.iov_base; | 1169 | void *p, *end; |
1116 | struct ceph_osd_request *req; | 1170 | struct ceph_osd_request *req; |
1117 | u64 tid; | 1171 | u64 tid; |
1118 | int numops, object_len, flags; | 1172 | int object_len; |
1173 | int numops, payload_len, flags; | ||
1119 | s32 result; | 1174 | s32 result; |
1175 | s32 retry_attempt; | ||
1176 | struct ceph_pg pg; | ||
1177 | int err; | ||
1178 | u32 reassert_epoch; | ||
1179 | u64 reassert_version; | ||
1180 | u32 osdmap_epoch; | ||
1181 | int i; | ||
1120 | 1182 | ||
1121 | tid = le64_to_cpu(msg->hdr.tid); | 1183 | tid = le64_to_cpu(msg->hdr.tid); |
1122 | if (msg->front.iov_len < sizeof(*rhead)) | 1184 | dout("handle_reply %p tid %llu\n", msg, tid); |
1123 | goto bad; | 1185 | |
1124 | numops = le32_to_cpu(rhead->num_ops); | 1186 | p = msg->front.iov_base; |
1125 | object_len = le32_to_cpu(rhead->object_len); | 1187 | end = p + msg->front.iov_len; |
1126 | result = le32_to_cpu(rhead->result); | 1188 | |
1127 | if (msg->front.iov_len != sizeof(*rhead) + object_len + | 1189 | ceph_decode_need(&p, end, 4, bad); |
1128 | numops * sizeof(struct ceph_osd_op)) | 1190 | object_len = ceph_decode_32(&p); |
1191 | ceph_decode_need(&p, end, object_len, bad); | ||
1192 | p += object_len; | ||
1193 | |||
1194 | err = __decode_pgid(&p, end, &pg); | ||
1195 | if (err) | ||
1129 | goto bad; | 1196 | goto bad; |
1130 | dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); | 1197 | |
1198 | ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); | ||
1199 | flags = ceph_decode_64(&p); | ||
1200 | result = ceph_decode_32(&p); | ||
1201 | reassert_epoch = ceph_decode_32(&p); | ||
1202 | reassert_version = ceph_decode_64(&p); | ||
1203 | osdmap_epoch = ceph_decode_32(&p); | ||
1204 | |||
1131 | /* lookup */ | 1205 | /* lookup */ |
1132 | mutex_lock(&osdc->request_mutex); | 1206 | mutex_lock(&osdc->request_mutex); |
1133 | req = __lookup_request(osdc, tid); | 1207 | req = __lookup_request(osdc, tid); |
@@ -1137,7 +1211,38 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1137 | return; | 1211 | return; |
1138 | } | 1212 | } |
1139 | ceph_osdc_get_request(req); | 1213 | ceph_osdc_get_request(req); |
1140 | flags = le32_to_cpu(rhead->flags); | 1214 | |
1215 | dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, | ||
1216 | req, result); | ||
1217 | |||
1218 | ceph_decode_need(&p, end, 4, bad); | ||
1219 | numops = ceph_decode_32(&p); | ||
1220 | if (numops > CEPH_OSD_MAX_OP) | ||
1221 | goto bad_put; | ||
1222 | if (numops != req->r_num_ops) | ||
1223 | goto bad_put; | ||
1224 | payload_len = 0; | ||
1225 | ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad); | ||
1226 | for (i = 0; i < numops; i++) { | ||
1227 | struct ceph_osd_op *op = p; | ||
1228 | int len; | ||
1229 | |||
1230 | len = le32_to_cpu(op->payload_len); | ||
1231 | req->r_reply_op_len[i] = len; | ||
1232 | dout(" op %d has %d bytes\n", i, len); | ||
1233 | payload_len += len; | ||
1234 | p += sizeof(*op); | ||
1235 | } | ||
1236 | if (payload_len != le32_to_cpu(msg->hdr.data_len)) { | ||
1237 | pr_warning("sum of op payload lens %d != data_len %d", | ||
1238 | payload_len, le32_to_cpu(msg->hdr.data_len)); | ||
1239 | goto bad_put; | ||
1240 | } | ||
1241 | |||
1242 | ceph_decode_need(&p, end, 4 + numops * 4, bad); | ||
1243 | retry_attempt = ceph_decode_32(&p); | ||
1244 | for (i = 0; i < numops; i++) | ||
1245 | req->r_reply_op_result[i] = ceph_decode_32(&p); | ||
1141 | 1246 | ||
1142 | /* | 1247 | /* |
1143 | * if this connection filled our message, drop our reference now, to | 1248 | * if this connection filled our message, drop our reference now, to |
@@ -1152,7 +1257,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1152 | if (!req->r_got_reply) { | 1257 | if (!req->r_got_reply) { |
1153 | unsigned int bytes; | 1258 | unsigned int bytes; |
1154 | 1259 | ||
1155 | req->r_result = le32_to_cpu(rhead->result); | 1260 | req->r_result = result; |
1156 | bytes = le32_to_cpu(msg->hdr.data_len); | 1261 | bytes = le32_to_cpu(msg->hdr.data_len); |
1157 | dout("handle_reply result %d bytes %d\n", req->r_result, | 1262 | dout("handle_reply result %d bytes %d\n", req->r_result, |
1158 | bytes); | 1263 | bytes); |
@@ -1160,7 +1265,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, | |||
1160 | req->r_result = bytes; | 1265 | req->r_result = bytes; |
1161 | 1266 | ||
1162 | /* in case this is a write and we need to replay, */ | 1267 | /* in case this is a write and we need to replay, */ |
1163 | req->r_reassert_version = rhead->reassert_version; | 1268 | req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); |
1269 | req->r_reassert_version.version = cpu_to_le64(reassert_version); | ||
1164 | 1270 | ||
1165 | req->r_got_reply = 1; | 1271 | req->r_got_reply = 1; |
1166 | } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { | 1272 | } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { |
@@ -1195,10 +1301,11 @@ done: | |||
1195 | ceph_osdc_put_request(req); | 1301 | ceph_osdc_put_request(req); |
1196 | return; | 1302 | return; |
1197 | 1303 | ||
1304 | bad_put: | ||
1305 | ceph_osdc_put_request(req); | ||
1198 | bad: | 1306 | bad: |
1199 | pr_err("corrupt osd_op_reply got %d %d expected %d\n", | 1307 | pr_err("corrupt osd_op_reply got %d %d\n", |
1200 | (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), | 1308 | (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); |
1201 | (int)sizeof(*rhead)); | ||
1202 | ceph_msg_dump(msg); | 1309 | ceph_msg_dump(msg); |
1203 | } | 1310 | } |
1204 | 1311 | ||