diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2016-04-28 10:07:22 -0400 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2016-05-25 18:36:25 -0400 |
commit | 6f3bfd45cd233eea0b07e3cabc0386b5de9321d2 (patch) | |
tree | cda9593b00d971b10ebeb9279ad1893978236df8 /net | |
parent | d9591f5e28686277d9312d3c7422faf1368b305e (diff) |
libceph: ceph_osds, ceph_pg_to_up_acting_osds()
Knowing just acting set isn't enough, we need to be able to record up
set as well to detect interval changes. This means returning (up[],
up_len, up_primary, acting[], acting_len, acting_primary) and passing
it around. Introduce and switch to ceph_osds to help with that.
Rename ceph_calc_pg_acting() to ceph_pg_to_up_acting_osds() and return
both up and acting sets from it.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'net')
-rw-r--r-- | net/ceph/osd_client.c | 36 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 304 |
2 files changed, 197 insertions, 143 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cb9f1953f5fb..0ff400a56cd6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -1358,8 +1358,7 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
1358 | struct ceph_osd_request *req, int force_resend) | 1358 | struct ceph_osd_request *req, int force_resend) |
1359 | { | 1359 | { |
1360 | struct ceph_pg pgid; | 1360 | struct ceph_pg pgid; |
1361 | int acting[CEPH_PG_MAX_SIZE]; | 1361 | struct ceph_osds up, acting; |
1362 | int num, o; | ||
1363 | int err; | 1362 | int err; |
1364 | bool was_paused; | 1363 | bool was_paused; |
1365 | 1364 | ||
@@ -1372,9 +1371,7 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
1372 | } | 1371 | } |
1373 | req->r_pgid = pgid; | 1372 | req->r_pgid = pgid; |
1374 | 1373 | ||
1375 | num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); | 1374 | ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); |
1376 | if (num < 0) | ||
1377 | num = 0; | ||
1378 | 1375 | ||
1379 | was_paused = req->r_paused; | 1376 | was_paused = req->r_paused; |
1380 | req->r_paused = __req_should_be_paused(osdc, req); | 1377 | req->r_paused = __req_should_be_paused(osdc, req); |
@@ -1382,21 +1379,23 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
1382 | force_resend = 1; | 1379 | force_resend = 1; |
1383 | 1380 | ||
1384 | if ((!force_resend && | 1381 | if ((!force_resend && |
1385 | req->r_osd && req->r_osd->o_osd == o && | 1382 | req->r_osd && req->r_osd->o_osd == acting.primary && |
1386 | req->r_sent >= req->r_osd->o_incarnation && | 1383 | req->r_sent >= req->r_osd->o_incarnation && |
1387 | req->r_num_pg_osds == num && | 1384 | req->r_num_pg_osds == acting.size && |
1388 | memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || | 1385 | memcmp(req->r_pg_osds, acting.osds, |
1389 | (req->r_osd == NULL && o == -1) || | 1386 | acting.size * sizeof(acting.osds[0])) == 0) || |
1387 | (req->r_osd == NULL && acting.primary == -1) || | ||
1390 | req->r_paused) | 1388 | req->r_paused) |
1391 | return 0; /* no change */ | 1389 | return 0; /* no change */ |
1392 | 1390 | ||
1393 | dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", | 1391 | dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", |
1394 | req->r_tid, pgid.pool, pgid.seed, o, | 1392 | req->r_tid, pgid.pool, pgid.seed, acting.primary, |
1395 | req->r_osd ? req->r_osd->o_osd : -1); | 1393 | req->r_osd ? req->r_osd->o_osd : -1); |
1396 | 1394 | ||
1397 | /* record full pg acting set */ | 1395 | /* record full pg acting set */ |
1398 | memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); | 1396 | memcpy(req->r_pg_osds, acting.osds, |
1399 | req->r_num_pg_osds = num; | 1397 | acting.size * sizeof(acting.osds[0])); |
1398 | req->r_num_pg_osds = acting.size; | ||
1400 | 1399 | ||
1401 | if (req->r_osd) { | 1400 | if (req->r_osd) { |
1402 | __cancel_request(req); | 1401 | __cancel_request(req); |
@@ -1405,21 +1404,22 @@ static int __map_request(struct ceph_osd_client *osdc, | |||
1405 | req->r_osd = NULL; | 1404 | req->r_osd = NULL; |
1406 | } | 1405 | } |
1407 | 1406 | ||
1408 | req->r_osd = lookup_osd(&osdc->osds, o); | 1407 | req->r_osd = lookup_osd(&osdc->osds, acting.primary); |
1409 | if (!req->r_osd && o >= 0) { | 1408 | if (!req->r_osd && acting.primary >= 0) { |
1410 | err = -ENOMEM; | 1409 | err = -ENOMEM; |
1411 | req->r_osd = create_osd(osdc, o); | 1410 | req->r_osd = create_osd(osdc, acting.primary); |
1412 | if (!req->r_osd) { | 1411 | if (!req->r_osd) { |
1413 | list_move(&req->r_req_lru_item, &osdc->req_notarget); | 1412 | list_move(&req->r_req_lru_item, &osdc->req_notarget); |
1414 | goto out; | 1413 | goto out; |
1415 | } | 1414 | } |
1416 | 1415 | ||
1417 | dout("map_request osd %p is osd%d\n", req->r_osd, o); | 1416 | dout("map_request osd %p is osd%d\n", req->r_osd, |
1417 | acting.primary); | ||
1418 | insert_osd(&osdc->osds, req->r_osd); | 1418 | insert_osd(&osdc->osds, req->r_osd); |
1419 | 1419 | ||
1420 | ceph_con_open(&req->r_osd->o_con, | 1420 | ceph_con_open(&req->r_osd->o_con, |
1421 | CEPH_ENTITY_TYPE_OSD, o, | 1421 | CEPH_ENTITY_TYPE_OSD, acting.primary, |
1422 | &osdc->osdmap->osd_addr[o]); | 1422 | &osdc->osdmap->osd_addr[acting.primary]); |
1423 | } | 1423 | } |
1424 | 1424 | ||
1425 | __enqueue_request(req); | 1425 | __enqueue_request(req); |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 6267839cb246..f5fc8fc63879 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -1474,6 +1474,38 @@ void ceph_oid_destroy(struct ceph_object_id *oid) | |||
1474 | } | 1474 | } |
1475 | EXPORT_SYMBOL(ceph_oid_destroy); | 1475 | EXPORT_SYMBOL(ceph_oid_destroy); |
1476 | 1476 | ||
1477 | static bool osds_valid(const struct ceph_osds *set) | ||
1478 | { | ||
1479 | /* non-empty set */ | ||
1480 | if (set->size > 0 && set->primary >= 0) | ||
1481 | return true; | ||
1482 | |||
1483 | /* empty can_shift_osds set */ | ||
1484 | if (!set->size && set->primary == -1) | ||
1485 | return true; | ||
1486 | |||
1487 | /* empty !can_shift_osds set - all NONE */ | ||
1488 | if (set->size > 0 && set->primary == -1) { | ||
1489 | int i; | ||
1490 | |||
1491 | for (i = 0; i < set->size; i++) { | ||
1492 | if (set->osds[i] != CRUSH_ITEM_NONE) | ||
1493 | break; | ||
1494 | } | ||
1495 | if (i == set->size) | ||
1496 | return true; | ||
1497 | } | ||
1498 | |||
1499 | return false; | ||
1500 | } | ||
1501 | |||
1502 | void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) | ||
1503 | { | ||
1504 | memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); | ||
1505 | dest->size = src->size; | ||
1506 | dest->primary = src->primary; | ||
1507 | } | ||
1508 | |||
1477 | /* | 1509 | /* |
1478 | * calculate file layout from given offset, length. | 1510 | * calculate file layout from given offset, length. |
1479 | * fill in correct oid, logical length, and object extent | 1511 | * fill in correct oid, logical length, and object extent |
@@ -1571,6 +1603,46 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, | |||
1571 | } | 1603 | } |
1572 | EXPORT_SYMBOL(ceph_object_locator_to_pg); | 1604 | EXPORT_SYMBOL(ceph_object_locator_to_pg); |
1573 | 1605 | ||
1606 | /* | ||
1607 | * Map a raw PG (full precision ps) into an actual PG. | ||
1608 | */ | ||
1609 | static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, | ||
1610 | const struct ceph_pg *raw_pgid, | ||
1611 | struct ceph_pg *pgid) | ||
1612 | { | ||
1613 | pgid->pool = raw_pgid->pool; | ||
1614 | pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, | ||
1615 | pi->pg_num_mask); | ||
1616 | } | ||
1617 | |||
1618 | /* | ||
1619 | * Map a raw PG (full precision ps) into a placement ps (placement | ||
1620 | * seed). Include pool id in that value so that different pools don't | ||
1621 | * use the same seeds. | ||
1622 | */ | ||
1623 | static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, | ||
1624 | const struct ceph_pg *raw_pgid) | ||
1625 | { | ||
1626 | if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { | ||
1627 | /* hash pool id and seed so that pool PGs do not overlap */ | ||
1628 | return crush_hash32_2(CRUSH_HASH_RJENKINS1, | ||
1629 | ceph_stable_mod(raw_pgid->seed, | ||
1630 | pi->pgp_num, | ||
1631 | pi->pgp_num_mask), | ||
1632 | raw_pgid->pool); | ||
1633 | } else { | ||
1634 | /* | ||
1635 | * legacy behavior: add ps and pool together. this is | ||
1636 | * not a great approach because the PGs from each pool | ||
1637 | * will overlap on top of each other: 0.5 == 1.4 == | ||
1638 | * 2.3 == ... | ||
1639 | */ | ||
1640 | return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, | ||
1641 | pi->pgp_num_mask) + | ||
1642 | (unsigned)raw_pgid->pool; | ||
1643 | } | ||
1644 | } | ||
1645 | |||
1574 | static int do_crush(struct ceph_osdmap *map, int ruleno, int x, | 1646 | static int do_crush(struct ceph_osdmap *map, int ruleno, int x, |
1575 | int *result, int result_max, | 1647 | int *result, int result_max, |
1576 | const __u32 *weight, int weight_max) | 1648 | const __u32 *weight, int weight_max) |
@@ -1588,84 +1660,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, | |||
1588 | } | 1660 | } |
1589 | 1661 | ||
1590 | /* | 1662 | /* |
1591 | * Calculate raw (crush) set for given pgid. | 1663 | * Calculate raw set (CRUSH output) for given PG. The result may |
1664 | * contain nonexistent OSDs. ->primary is undefined for a raw set. | ||
1592 | * | 1665 | * |
1593 | * Return raw set length, or error. | 1666 | * Placement seed (CRUSH input) is returned through @ppps. |
1594 | */ | 1667 | */ |
1595 | static int pg_to_raw_osds(struct ceph_osdmap *osdmap, | 1668 | static void pg_to_raw_osds(struct ceph_osdmap *osdmap, |
1596 | struct ceph_pg_pool_info *pool, | 1669 | struct ceph_pg_pool_info *pi, |
1597 | struct ceph_pg pgid, u32 pps, int *osds) | 1670 | const struct ceph_pg *raw_pgid, |
1671 | struct ceph_osds *raw, | ||
1672 | u32 *ppps) | ||
1598 | { | 1673 | { |
1674 | u32 pps = raw_pg_to_pps(pi, raw_pgid); | ||
1599 | int ruleno; | 1675 | int ruleno; |
1600 | int len; | 1676 | int len; |
1601 | 1677 | ||
1602 | /* crush */ | 1678 | ceph_osds_init(raw); |
1603 | ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, | 1679 | if (ppps) |
1604 | pool->type, pool->size); | 1680 | *ppps = pps; |
1681 | |||
1682 | ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, | ||
1683 | pi->size); | ||
1605 | if (ruleno < 0) { | 1684 | if (ruleno < 0) { |
1606 | pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", | 1685 | pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", |
1607 | pgid.pool, pool->crush_ruleset, pool->type, | 1686 | pi->id, pi->crush_ruleset, pi->type, pi->size); |
1608 | pool->size); | 1687 | return; |
1609 | return -ENOENT; | ||
1610 | } | 1688 | } |
1611 | 1689 | ||
1612 | len = do_crush(osdmap, ruleno, pps, osds, | 1690 | len = do_crush(osdmap, ruleno, pps, raw->osds, |
1613 | min_t(int, pool->size, CEPH_PG_MAX_SIZE), | 1691 | min_t(int, pi->size, ARRAY_SIZE(raw->osds)), |
1614 | osdmap->osd_weight, osdmap->max_osd); | 1692 | osdmap->osd_weight, osdmap->max_osd); |
1615 | if (len < 0) { | 1693 | if (len < 0) { |
1616 | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", | 1694 | pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", |
1617 | len, ruleno, pgid.pool, pool->crush_ruleset, | 1695 | len, ruleno, pi->id, pi->crush_ruleset, pi->type, |
1618 | pool->type, pool->size); | 1696 | pi->size); |
1619 | return len; | 1697 | return; |
1620 | } | 1698 | } |
1621 | 1699 | ||
1622 | return len; | 1700 | raw->size = len; |
1623 | } | 1701 | } |
1624 | 1702 | ||
1625 | /* | 1703 | /* |
1626 | * Given raw set, calculate up set and up primary. | 1704 | * Given raw set, calculate up set and up primary. By definition of an |
1705 | * up set, the result won't contain nonexistent or down OSDs. | ||
1627 | * | 1706 | * |
1628 | * Return up set length. *primary is set to up primary osd id, or -1 | 1707 | * This is done in-place - on return @set is the up set. If it's |
1629 | * if up set is empty. | 1708 | * empty, ->primary will remain undefined. |
1630 | */ | 1709 | */ |
1631 | static int raw_to_up_osds(struct ceph_osdmap *osdmap, | 1710 | static void raw_to_up_osds(struct ceph_osdmap *osdmap, |
1632 | struct ceph_pg_pool_info *pool, | 1711 | struct ceph_pg_pool_info *pi, |
1633 | int *osds, int len, int *primary) | 1712 | struct ceph_osds *set) |
1634 | { | 1713 | { |
1635 | int up_primary = -1; | ||
1636 | int i; | 1714 | int i; |
1637 | 1715 | ||
1638 | if (ceph_can_shift_osds(pool)) { | 1716 | /* ->primary is undefined for a raw set */ |
1717 | BUG_ON(set->primary != -1); | ||
1718 | |||
1719 | if (ceph_can_shift_osds(pi)) { | ||
1639 | int removed = 0; | 1720 | int removed = 0; |
1640 | 1721 | ||
1641 | for (i = 0; i < len; i++) { | 1722 | /* shift left */ |
1642 | if (ceph_osd_is_down(osdmap, osds[i])) { | 1723 | for (i = 0; i < set->size; i++) { |
1724 | if (ceph_osd_is_down(osdmap, set->osds[i])) { | ||
1643 | removed++; | 1725 | removed++; |
1644 | continue; | 1726 | continue; |
1645 | } | 1727 | } |
1646 | if (removed) | 1728 | if (removed) |
1647 | osds[i - removed] = osds[i]; | 1729 | set->osds[i - removed] = set->osds[i]; |
1648 | } | 1730 | } |
1649 | 1731 | set->size -= removed; | |
1650 | len -= removed; | 1732 | if (set->size > 0) |
1651 | if (len > 0) | 1733 | set->primary = set->osds[0]; |
1652 | up_primary = osds[0]; | ||
1653 | } else { | 1734 | } else { |
1654 | for (i = len - 1; i >= 0; i--) { | 1735 | /* set down/dne devices to NONE */ |
1655 | if (ceph_osd_is_down(osdmap, osds[i])) | 1736 | for (i = set->size - 1; i >= 0; i--) { |
1656 | osds[i] = CRUSH_ITEM_NONE; | 1737 | if (ceph_osd_is_down(osdmap, set->osds[i])) |
1738 | set->osds[i] = CRUSH_ITEM_NONE; | ||
1657 | else | 1739 | else |
1658 | up_primary = osds[i]; | 1740 | set->primary = set->osds[i]; |
1659 | } | 1741 | } |
1660 | } | 1742 | } |
1661 | |||
1662 | *primary = up_primary; | ||
1663 | return len; | ||
1664 | } | 1743 | } |
1665 | 1744 | ||
1666 | static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | 1745 | static void apply_primary_affinity(struct ceph_osdmap *osdmap, |
1667 | struct ceph_pg_pool_info *pool, | 1746 | struct ceph_pg_pool_info *pi, |
1668 | int *osds, int len, int *primary) | 1747 | u32 pps, |
1748 | struct ceph_osds *up) | ||
1669 | { | 1749 | { |
1670 | int i; | 1750 | int i; |
1671 | int pos = -1; | 1751 | int pos = -1; |
@@ -1677,8 +1757,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1677 | if (!osdmap->osd_primary_affinity) | 1757 | if (!osdmap->osd_primary_affinity) |
1678 | return; | 1758 | return; |
1679 | 1759 | ||
1680 | for (i = 0; i < len; i++) { | 1760 | for (i = 0; i < up->size; i++) { |
1681 | int osd = osds[i]; | 1761 | int osd = up->osds[i]; |
1682 | 1762 | ||
1683 | if (osd != CRUSH_ITEM_NONE && | 1763 | if (osd != CRUSH_ITEM_NONE && |
1684 | osdmap->osd_primary_affinity[osd] != | 1764 | osdmap->osd_primary_affinity[osd] != |
@@ -1686,7 +1766,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1686 | break; | 1766 | break; |
1687 | } | 1767 | } |
1688 | } | 1768 | } |
1689 | if (i == len) | 1769 | if (i == up->size) |
1690 | return; | 1770 | return; |
1691 | 1771 | ||
1692 | /* | 1772 | /* |
@@ -1694,8 +1774,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1694 | * osd into the hash/rng so that a proportional fraction of an | 1774 | * osd into the hash/rng so that a proportional fraction of an |
1695 | * osd's pgs get rejected as primary. | 1775 | * osd's pgs get rejected as primary. |
1696 | */ | 1776 | */ |
1697 | for (i = 0; i < len; i++) { | 1777 | for (i = 0; i < up->size; i++) { |
1698 | int osd = osds[i]; | 1778 | int osd = up->osds[i]; |
1699 | u32 aff; | 1779 | u32 aff; |
1700 | 1780 | ||
1701 | if (osd == CRUSH_ITEM_NONE) | 1781 | if (osd == CRUSH_ITEM_NONE) |
@@ -1720,123 +1800,99 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, | |||
1720 | if (pos < 0) | 1800 | if (pos < 0) |
1721 | return; | 1801 | return; |
1722 | 1802 | ||
1723 | *primary = osds[pos]; | 1803 | up->primary = up->osds[pos]; |
1724 | 1804 | ||
1725 | if (ceph_can_shift_osds(pool) && pos > 0) { | 1805 | if (ceph_can_shift_osds(pi) && pos > 0) { |
1726 | /* move the new primary to the front */ | 1806 | /* move the new primary to the front */ |
1727 | for (i = pos; i > 0; i--) | 1807 | for (i = pos; i > 0; i--) |
1728 | osds[i] = osds[i - 1]; | 1808 | up->osds[i] = up->osds[i - 1]; |
1729 | osds[0] = *primary; | 1809 | up->osds[0] = up->primary; |
1730 | } | 1810 | } |
1731 | } | 1811 | } |
1732 | 1812 | ||
1733 | /* | 1813 | /* |
1734 | * Given up set, apply pg_temp and primary_temp mappings. | 1814 | * Get pg_temp and primary_temp mappings for given PG. |
1735 | * | 1815 | * |
1736 | * Return acting set length. *primary is set to acting primary osd id, | 1816 | * Note that a PG may have none, only pg_temp, only primary_temp or |
1737 | * or -1 if acting set is empty. | 1817 | * both pg_temp and primary_temp mappings. This means @temp isn't |
1818 | * always a valid OSD set on return: in the "only primary_temp" case, | ||
1819 | * @temp will have its ->primary >= 0 but ->size == 0. | ||
1738 | */ | 1820 | */ |
1739 | static int apply_temps(struct ceph_osdmap *osdmap, | 1821 | static void get_temp_osds(struct ceph_osdmap *osdmap, |
1740 | struct ceph_pg_pool_info *pool, struct ceph_pg pgid, | 1822 | struct ceph_pg_pool_info *pi, |
1741 | int *osds, int len, int *primary) | 1823 | const struct ceph_pg *raw_pgid, |
1824 | struct ceph_osds *temp) | ||
1742 | { | 1825 | { |
1826 | struct ceph_pg pgid; | ||
1743 | struct ceph_pg_mapping *pg; | 1827 | struct ceph_pg_mapping *pg; |
1744 | int temp_len; | ||
1745 | int temp_primary; | ||
1746 | int i; | 1828 | int i; |
1747 | 1829 | ||
1748 | /* raw_pg -> pg */ | 1830 | raw_pg_to_pg(pi, raw_pgid, &pgid); |
1749 | pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, | 1831 | ceph_osds_init(temp); |
1750 | pool->pg_num_mask); | ||
1751 | 1832 | ||
1752 | /* pg_temp? */ | 1833 | /* pg_temp? */ |
1753 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); | 1834 | pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); |
1754 | if (pg) { | 1835 | if (pg) { |
1755 | temp_len = 0; | ||
1756 | temp_primary = -1; | ||
1757 | |||
1758 | for (i = 0; i < pg->pg_temp.len; i++) { | 1836 | for (i = 0; i < pg->pg_temp.len; i++) { |
1759 | if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { | 1837 | if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { |
1760 | if (ceph_can_shift_osds(pool)) | 1838 | if (ceph_can_shift_osds(pi)) |
1761 | continue; | 1839 | continue; |
1762 | else | 1840 | |
1763 | osds[temp_len++] = CRUSH_ITEM_NONE; | 1841 | temp->osds[temp->size++] = CRUSH_ITEM_NONE; |
1764 | } else { | 1842 | } else { |
1765 | osds[temp_len++] = pg->pg_temp.osds[i]; | 1843 | temp->osds[temp->size++] = pg->pg_temp.osds[i]; |
1766 | } | 1844 | } |
1767 | } | 1845 | } |
1768 | 1846 | ||
1769 | /* apply pg_temp's primary */ | 1847 | /* apply pg_temp's primary */ |
1770 | for (i = 0; i < temp_len; i++) { | 1848 | for (i = 0; i < temp->size; i++) { |
1771 | if (osds[i] != CRUSH_ITEM_NONE) { | 1849 | if (temp->osds[i] != CRUSH_ITEM_NONE) { |
1772 | temp_primary = osds[i]; | 1850 | temp->primary = temp->osds[i]; |
1773 | break; | 1851 | break; |
1774 | } | 1852 | } |
1775 | } | 1853 | } |
1776 | } else { | ||
1777 | temp_len = len; | ||
1778 | temp_primary = *primary; | ||
1779 | } | 1854 | } |
1780 | 1855 | ||
1781 | /* primary_temp? */ | 1856 | /* primary_temp? */ |
1782 | pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); | 1857 | pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); |
1783 | if (pg) | 1858 | if (pg) |
1784 | temp_primary = pg->primary_temp.osd; | 1859 | temp->primary = pg->primary_temp.osd; |
1785 | |||
1786 | *primary = temp_primary; | ||
1787 | return temp_len; | ||
1788 | } | 1860 | } |
1789 | 1861 | ||
1790 | /* | 1862 | /* |
1791 | * Calculate acting set for given pgid. | 1863 | * Map a PG to its acting set as well as its up set. |
1792 | * | 1864 | * |
1793 | * Return acting set length, or error. *primary is set to acting | 1865 | * Acting set is used for data mapping purposes, while up set can be |
1794 | * primary osd id, or -1 if acting set is empty or on error. | 1866 | * recorded for detecting interval changes and deciding whether to |
1867 | * resend a request. | ||
1795 | */ | 1868 | */ |
1796 | int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | 1869 | void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, |
1797 | int *osds, int *primary) | 1870 | const struct ceph_pg *raw_pgid, |
1871 | struct ceph_osds *up, | ||
1872 | struct ceph_osds *acting) | ||
1798 | { | 1873 | { |
1799 | struct ceph_pg_pool_info *pool; | 1874 | struct ceph_pg_pool_info *pi; |
1800 | u32 pps; | 1875 | u32 pps; |
1801 | int len; | ||
1802 | 1876 | ||
1803 | pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); | 1877 | pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); |
1804 | if (!pool) { | 1878 | if (!pi) { |
1805 | *primary = -1; | 1879 | ceph_osds_init(up); |
1806 | return -ENOENT; | 1880 | ceph_osds_init(acting); |
1881 | goto out; | ||
1807 | } | 1882 | } |
1808 | 1883 | ||
1809 | if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { | 1884 | pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); |
1810 | /* hash pool id and seed so that pool PGs do not overlap */ | 1885 | raw_to_up_osds(osdmap, pi, up); |
1811 | pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, | 1886 | apply_primary_affinity(osdmap, pi, pps, up); |
1812 | ceph_stable_mod(pgid.seed, pool->pgp_num, | 1887 | get_temp_osds(osdmap, pi, raw_pgid, acting); |
1813 | pool->pgp_num_mask), | 1888 | if (!acting->size) { |
1814 | pgid.pool); | 1889 | memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); |
1815 | } else { | 1890 | acting->size = up->size; |
1816 | /* | 1891 | if (acting->primary == -1) |
1817 | * legacy behavior: add ps and pool together. this is | 1892 | acting->primary = up->primary; |
1818 | * not a great approach because the PGs from each pool | ||
1819 | * will overlap on top of each other: 0.5 == 1.4 == | ||
1820 | * 2.3 == ... | ||
1821 | */ | ||
1822 | pps = ceph_stable_mod(pgid.seed, pool->pgp_num, | ||
1823 | pool->pgp_num_mask) + | ||
1824 | (unsigned)pgid.pool; | ||
1825 | } | ||
1826 | |||
1827 | len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); | ||
1828 | if (len < 0) { | ||
1829 | *primary = -1; | ||
1830 | return len; | ||
1831 | } | 1893 | } |
1832 | 1894 | out: | |
1833 | len = raw_to_up_osds(osdmap, pool, osds, len, primary); | 1895 | WARN_ON(!osds_valid(up) || !osds_valid(acting)); |
1834 | |||
1835 | apply_primary_affinity(osdmap, pps, pool, osds, len, primary); | ||
1836 | |||
1837 | len = apply_temps(osdmap, pool, pgid, osds, len, primary); | ||
1838 | |||
1839 | return len; | ||
1840 | } | 1896 | } |
1841 | 1897 | ||
1842 | /* | 1898 | /* |
@@ -1844,11 +1900,9 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, | |||
1844 | */ | 1900 | */ |
1845 | int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) | 1901 | int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) |
1846 | { | 1902 | { |
1847 | int osds[CEPH_PG_MAX_SIZE]; | 1903 | struct ceph_osds up, acting; |
1848 | int primary; | ||
1849 | |||
1850 | ceph_calc_pg_acting(osdmap, pgid, osds, &primary); | ||
1851 | 1904 | ||
1852 | return primary; | 1905 | ceph_pg_to_up_acting_osds(osdmap, &pgid, &up, &acting); |
1906 | return acting.primary; | ||
1853 | } | 1907 | } |
1854 | EXPORT_SYMBOL(ceph_calc_pg_primary); | 1908 | EXPORT_SYMBOL(ceph_calc_pg_primary); |