aboutsummaryrefslogtreecommitdiffstats
path: root/net/ceph/osd_client.c
diff options
context:
space:
mode:
authorAlex Elder <elder@inktank.com>2013-03-25 19:16:11 -0400
committerSage Weil <sage@inktank.com>2013-05-02 00:17:18 -0400
commite02493c07c4cb08106d0b3a4b5003c7c005010fb (patch)
tree25e288421a2758bb0364f65d43d96a501a1cca74 /net/ceph/osd_client.c
parent92451b4910895936cc05ce1d283644ffc44d7537 (diff)
libceph: requeue only sent requests when kicking
The osd expects incoming requests for a given object from a given client to arrive in order, with the tid for each request being greater than the tid for requests that have already arrived. This patch fixes two places the osd client might not maintain that ordering. For the osd client, the connection fault method is osd_reset(). That function calls __reset_osd() to close and re-open the connection, then calls __kick_osd_requests() to cause all outstanding requests for the affected osd to be re-sent after the connection has been re-established. When an osd is reset, any in-flight messages will need to be re-sent. An osd client maintains distinct lists for unsent and in-flight messages. Meanwhile, an osd maintains a single list of all its requests (both sent and un-sent). (Each message is linked into two lists--one for the osd client and one list for the osd.) To process an osd "kick" operation, the request list for the *osd* is traversed, and each request is moved off whichever osd *client* list it was on (unsent or sent) and placed onto the osd client's unsent list. (It remains where it is on the osd's request list.) When that is done, osd_reset() calls __send_queued() to cause each of the osd client's unsent messages to be sent. OK, with that background... As the osd request list is traversed each request is prepended to the osd client's unsent list in the order they're seen. The effect of this is to reverse the order of these requests as they are put (back) onto the unsent list. Instead, build up a list of only the requests for an osd that have already been sent (by checking their r_sent flag values). Once an unsent request is found, stop examining requests and prepend the requests that need re-sending to the osd client's unsent list. Preserve the original order of requests in the process (previously re-queued requests were reversed in this process). 
Because they have already been sent, they will have lower tids than any request already present on the unsent list. Just below that, traverse the linger list in forward order as before, but add its requests to the *tail* of the unsent list rather than the head. These requests get re-registered, and in the process are given a new (higher) tid, so they should go at the end. This partially resolves: http://tracker.ceph.com/issues/4392 Signed-off-by: Alex Elder <elder@inktank.com> Reviewed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'net/ceph/osd_client.c')
-rw-r--r--net/ceph/osd_client.c33
1 files changed, 29 insertions, 4 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 3723a7f16afd..8b84fb4980ba 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -570,21 +570,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
570 struct ceph_osd *osd) 570 struct ceph_osd *osd)
571{ 571{
572 struct ceph_osd_request *req, *nreq; 572 struct ceph_osd_request *req, *nreq;
573 LIST_HEAD(resend);
573 int err; 574 int err;
574 575
575 dout("__kick_osd_requests osd%d\n", osd->o_osd); 576 dout("__kick_osd_requests osd%d\n", osd->o_osd);
576 err = __reset_osd(osdc, osd); 577 err = __reset_osd(osdc, osd);
577 if (err) 578 if (err)
578 return; 579 return;
579 580 /*
581 * Build up a list of requests to resend by traversing the
582 * osd's list of requests. Requests for a given object are
583 * sent in tid order, and that is also the order they're
584 * kept on this list. Therefore all requests that are in
585 * flight will be found first, followed by all requests that
586 * have not yet been sent. And to resend requests while
587 * preserving this order we will want to put any sent
588 * requests back on the front of the osd client's unsent
589 * list.
590 *
591 * So we build a separate ordered list of already-sent
592 * requests for the affected osd and splice it onto the
593 * front of the osd client's unsent list. Once we've seen a
594 * request that has not yet been sent we're done. Those
595 * requests are already sitting right where they belong.
596 */
580 list_for_each_entry(req, &osd->o_requests, r_osd_item) { 597 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
581 list_move(&req->r_req_lru_item, &osdc->req_unsent); 598 if (!req->r_sent)
582 dout("requeued %p tid %llu osd%d\n", req, req->r_tid, 599 break;
600 list_move_tail(&req->r_req_lru_item, &resend);
601 dout("requeueing %p tid %llu osd%d\n", req, req->r_tid,
583 osd->o_osd); 602 osd->o_osd);
584 if (!req->r_linger) 603 if (!req->r_linger)
585 req->r_flags |= CEPH_OSD_FLAG_RETRY; 604 req->r_flags |= CEPH_OSD_FLAG_RETRY;
586 } 605 }
606 list_splice(&resend, &osdc->req_unsent);
587 607
608 /*
609 * Linger requests are re-registered before sending, which
610 * sets up a new tid for each. We add them to the unsent
611 * list at the end to keep things in tid order.
612 */
588 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, 613 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
589 r_linger_osd) { 614 r_linger_osd) {
590 /* 615 /*
@@ -593,7 +618,7 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
593 */ 618 */
594 BUG_ON(!list_empty(&req->r_req_lru_item)); 619 BUG_ON(!list_empty(&req->r_req_lru_item));
595 __register_request(osdc, req); 620 __register_request(osdc, req);
596 list_add(&req->r_req_lru_item, &osdc->req_unsent); 621 list_add_tail(&req->r_req_lru_item, &osdc->req_unsent);
597 list_add(&req->r_osd_item, &req->r_osd->o_requests); 622 list_add(&req->r_osd_item, &req->r_osd->o_requests);
598 __unregister_linger_request(osdc, req); 623 __unregister_linger_request(osdc, req);
599 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, 624 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,