aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorAlex Elder <elder@inktank.com>2013-04-15 12:20:42 -0400
committerSage Weil <sage@inktank.com>2013-05-02 00:18:52 -0400
commit26be88087ae8a04a5b576aa2f490597b649fc132 (patch)
treede16ba5b5b9fe678546fe82f0d6801a2ef441b0d /fs/ceph
parent7d7d51ce14fde491a6d0677d9bded9b3bd0d21d9 (diff)
libceph: change how "safe" callback is used
An osd request currently has two callbacks. They inform the initiator of the request when we've received confirmation for the target osd that a request was received, and when the osd indicates all changes described by the request are durable. The only time the second callback is used is in the ceph file system for a synchronous write. There's a race that makes some handling of this case unsafe. This patch addresses this problem. The error handling for this callback is also kind of gross, and this patch changes that as well. In ceph_sync_write(), if a safe callback is requested we want to add the request on the ceph inode's unsafe items list. Because items on this list must have their tid set (by ceph_osd_start_request()), the request added *after* the call to that function returns. The problem with this is that there's a race between starting the request and adding it to the unsafe items list; the request may already be complete before ceph_sync_write() even begins to put it on the list. To address this, we change the way the "safe" callback is used. Rather than just calling it when the request is "safe", we use it to notify the initiator the bounds (start and end) of the period during which the request is *unsafe*. So the initiator gets notified just before the request gets sent to the osd (when it is "unsafe"), and again when it's known the results are durable (it's no longer unsafe). The first call will get made in __send_request(), just before the request message gets sent to the messenger for the first time. That function is only called by __send_queued(), which is always called with the osd client's request mutex held. We then have this callback function insert the request on the ceph inode's unsafe list when we're told the request is unsafe. This will avoid the race because this call will be made under protection of the osd client's request mutex. It also nicely groups the setup and cleanup of the state associated with managing unsafe requests. The name of the "safe" callback field is changed to "unsafe" to better reflect its new purpose. It has a Boolean "unsafe" parameter to indicate whether the request is becoming unsafe or is now safe. Because the "msg" parameter wasn't used, we drop that. This resolves the original problem reportedin: http://tracker.ceph.com/issues/4706 Reported-by: Yan, Zheng <zheng.z.yan@intel.com> Signed-off-by: Alex Elder <elder@inktank.com> Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com> Reviewed-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/file.c52
1 files changed, 28 insertions, 24 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ae23e31a8f38..a65acf355384 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -446,19 +446,35 @@ done:
446} 446}
447 447
448/* 448/*
449 * Write commit callback, called if we requested both an ACK and 449 * Write commit request unsafe callback, called to tell us when a
450 * ONDISK commit reply from the OSD. 450 * request is unsafe (that is, in flight--has been handed to the
451 * messenger to send to its target osd). It is called again when
452 * we've received a response message indicating the request is
453 * "safe" (its CEPH_OSD_FLAG_ONDISK flag is set), or when a request
454 * is completed early (and unsuccessfully) due to a timeout or
455 * interrupt.
456 *
457 * This is used if we requested both an ACK and ONDISK commit reply
458 * from the OSD.
451 */ 459 */
452static void sync_write_commit(struct ceph_osd_request *req, 460static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
453 struct ceph_msg *msg)
454{ 461{
455 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 462 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
456 463
457 dout("sync_write_commit %p tid %llu\n", req, req->r_tid); 464 dout("%s %p tid %llu %ssafe\n", __func__, req, req->r_tid,
458 spin_lock(&ci->i_unsafe_lock); 465 unsafe ? "un" : "");
459 list_del_init(&req->r_unsafe_item); 466 if (unsafe) {
460 spin_unlock(&ci->i_unsafe_lock); 467 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
461 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR); 468 spin_lock(&ci->i_unsafe_lock);
469 list_add_tail(&req->r_unsafe_item,
470 &ci->i_unsafe_writes);
471 spin_unlock(&ci->i_unsafe_lock);
472 } else {
473 spin_lock(&ci->i_unsafe_lock);
474 list_del_init(&req->r_unsafe_item);
475 spin_unlock(&ci->i_unsafe_lock);
476 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
477 }
462} 478}
463 479
464/* 480/*
@@ -570,7 +586,8 @@ more:
570 586
571 if ((file->f_flags & O_SYNC) == 0) { 587 if ((file->f_flags & O_SYNC) == 0) {
572 /* get a second commit callback */ 588 /* get a second commit callback */
573 req->r_safe_callback = sync_write_commit; 589 req->r_unsafe_callback = ceph_sync_write_unsafe;
590 req->r_inode = inode;
574 own_pages = true; 591 own_pages = true;
575 } 592 }
576 } 593 }
@@ -581,21 +598,8 @@ more:
581 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); 598 ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
582 599
583 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 600 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
584 if (!ret) { 601 if (!ret)
585 if (req->r_safe_callback) {
586 /*
587 * Add to inode unsafe list only after we
588 * start_request so that a tid has been assigned.
589 */
590 spin_lock(&ci->i_unsafe_lock);
591 list_add_tail(&req->r_unsafe_item,
592 &ci->i_unsafe_writes);
593 spin_unlock(&ci->i_unsafe_lock);
594 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
595 }
596
597 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 602 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
598 }
599 603
600 if (file->f_flags & O_DIRECT) 604 if (file->f_flags & O_DIRECT)
601 ceph_put_page_vector(pages, num_pages, false); 605 ceph_put_page_vector(pages, num_pages, false);