about | summary | refs | log | tree | commit | diff | stats
path: root/net/ceph/osd_client.c
diff options
context:
space:
mode:
author: Josh Durgin <josh.durgin@inktank.com> 2013-12-10 12:35:13 -0500
committer: Ilya Dryomov <ilya.dryomov@inktank.com> 2013-12-13 16:04:28 -0500
commit: 9a1ea2dbff11547a8e664f143c1ffefc586a577a (patch)
tree: 54535e0308abc52c7f6a471e4ca9b1d72274e489 /net/ceph/osd_client.c
parent: d29adb34a94715174c88ca93e8aba955850c9bde (diff)
libceph: resend all writes after the osdmap loses the full flag
With the current full handling, there is a race between osds and clients getting the first map marked full. If the osd wins, it will return -ENOSPC to any writes, but the client may already have writes in flight. This results in the client getting the error and propagating it up the stack. For rbd, the block layer turns this into EIO, which can cause corruption in filesystems above it.

To avoid this race, osds are being changed to drop writes that came from clients with an osdmap older than the last osdmap marked full. In order for this to work, clients must resend all writes after they encounter a full -> not full transition in the osdmap. osds will wait for an updated map instead of processing a request from a client with a newer map, so resent writes will not be dropped by the osd unless there is another not full -> full transition.

This approach requires both osds and clients to be fixed to avoid the race. Old clients talking to osds with this fix may hang instead of returning EIO and potentially corrupting an fs. New clients talking to old osds have the same behavior as before if they encounter this race.

Fixes: http://tracker.ceph.com/issues/6938

Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Diffstat (limited to 'net/ceph/osd_client.c')
-rw-r--r-- net/ceph/osd_client.c | 28
1 files changed, 22 insertions, 6 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1ad9866dc707..9f1993582ff7 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1643,14 +1643,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
1643 * 1643 *
1644 * Caller should hold map_sem for read. 1644 * Caller should hold map_sem for read.
1645 */ 1645 */
1646static void kick_requests(struct ceph_osd_client *osdc, int force_resend) 1646static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
1647 bool force_resend_writes)
1647{ 1648{
1648 struct ceph_osd_request *req, *nreq; 1649 struct ceph_osd_request *req, *nreq;
1649 struct rb_node *p; 1650 struct rb_node *p;
1650 int needmap = 0; 1651 int needmap = 0;
1651 int err; 1652 int err;
1653 bool force_resend_req;
1652 1654
1653 dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); 1655 dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
1656 force_resend_writes ? " (force resend writes)" : "");
1654 mutex_lock(&osdc->request_mutex); 1657 mutex_lock(&osdc->request_mutex);
1655 for (p = rb_first(&osdc->requests); p; ) { 1658 for (p = rb_first(&osdc->requests); p; ) {
1656 req = rb_entry(p, struct ceph_osd_request, r_node); 1659 req = rb_entry(p, struct ceph_osd_request, r_node);
@@ -1675,7 +1678,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
1675 continue; 1678 continue;
1676 } 1679 }
1677 1680
1678 err = __map_request(osdc, req, force_resend); 1681 force_resend_req = force_resend ||
1682 (force_resend_writes &&
1683 req->r_flags & CEPH_OSD_FLAG_WRITE);
1684 err = __map_request(osdc, req, force_resend_req);
1679 if (err < 0) 1685 if (err < 0)
1680 continue; /* error */ 1686 continue; /* error */
1681 if (req->r_osd == NULL) { 1687 if (req->r_osd == NULL) {
@@ -1695,7 +1701,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
1695 r_linger_item) { 1701 r_linger_item) {
1696 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 1702 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
1697 1703
1698 err = __map_request(osdc, req, force_resend); 1704 err = __map_request(osdc, req,
1705 force_resend || force_resend_writes);
1699 dout("__map_request returned %d\n", err); 1706 dout("__map_request returned %d\n", err);
1700 if (err == 0) 1707 if (err == 0)
1701 continue; /* no change and no osd was specified */ 1708 continue; /* no change and no osd was specified */
@@ -1737,6 +1744,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1737 struct ceph_osdmap *newmap = NULL, *oldmap; 1744 struct ceph_osdmap *newmap = NULL, *oldmap;
1738 int err; 1745 int err;
1739 struct ceph_fsid fsid; 1746 struct ceph_fsid fsid;
1747 bool was_full;
1740 1748
1741 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); 1749 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1742 p = msg->front.iov_base; 1750 p = msg->front.iov_base;
@@ -1750,6 +1758,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1750 1758
1751 down_write(&osdc->map_sem); 1759 down_write(&osdc->map_sem);
1752 1760
1761 was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
1762
1753 /* incremental maps */ 1763 /* incremental maps */
1754 ceph_decode_32_safe(&p, end, nr_maps, bad); 1764 ceph_decode_32_safe(&p, end, nr_maps, bad);
1755 dout(" %d inc maps\n", nr_maps); 1765 dout(" %d inc maps\n", nr_maps);
@@ -1774,7 +1784,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1774 ceph_osdmap_destroy(osdc->osdmap); 1784 ceph_osdmap_destroy(osdc->osdmap);
1775 osdc->osdmap = newmap; 1785 osdc->osdmap = newmap;
1776 } 1786 }
1777 kick_requests(osdc, 0); 1787 was_full = was_full ||
1788 ceph_osdmap_flag(osdc->osdmap,
1789 CEPH_OSDMAP_FULL);
1790 kick_requests(osdc, 0, was_full);
1778 } else { 1791 } else {
1779 dout("ignoring incremental map %u len %d\n", 1792 dout("ignoring incremental map %u len %d\n",
1780 epoch, maplen); 1793 epoch, maplen);
@@ -1817,7 +1830,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1817 skipped_map = 1; 1830 skipped_map = 1;
1818 ceph_osdmap_destroy(oldmap); 1831 ceph_osdmap_destroy(oldmap);
1819 } 1832 }
1820 kick_requests(osdc, skipped_map); 1833 was_full = was_full ||
1834 ceph_osdmap_flag(osdc->osdmap,
1835 CEPH_OSDMAP_FULL);
1836 kick_requests(osdc, skipped_map, was_full);
1821 } 1837 }
1822 p += maplen; 1838 p += maplen;
1823 nr_maps--; 1839 nr_maps--;