diff options
author | Sage Weil <sage@newdream.net> | 2010-11-09 15:43:12 -0500 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2010-11-09 15:43:12 -0500 |
commit | b7495fc2ff941db6a118a93ab8d61149e3f4cef8 (patch) | |
tree | 231c339d74760e2fa13e5e6f41c10bc28cea51b3 /net | |
parent | e98b6fed84d0f0155d7b398e0dfeac74c792f2d0 (diff) |
ceph: make page alignment explicit in osd interface
We used to infer the alignment of an IO within a page from the file offset,
which assumed that the buffer's alignment within its page matched that of the
file offset. This broke with direct IO that was not aligned to pages (e.g.,
512-byte aligned IO). We were also trusting the alignment specified in the
OSD reply, which could have been adjusted by the server.
Explicitly specify the page alignment when setting up OSD IO requests.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'net')
-rw-r--r-- | net/ceph/osd_client.c | 22 |
1 files changed, 14 insertions, 8 deletions
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 79391994b3ed..6c096239660c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -71,6 +71,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, | |||
71 | op->extent.length = objlen; | 71 | op->extent.length = objlen; |
72 | } | 72 | } |
73 | req->r_num_pages = calc_pages_for(off, *plen); | 73 | req->r_num_pages = calc_pages_for(off, *plen); |
74 | req->r_page_alignment = off & ~PAGE_MASK; | ||
74 | if (op->op == CEPH_OSD_OP_WRITE) | 75 | if (op->op == CEPH_OSD_OP_WRITE) |
75 | op->payload_len = *plen; | 76 | op->payload_len = *plen; |
76 | 77 | ||
@@ -419,7 +420,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
419 | u32 truncate_seq, | 420 | u32 truncate_seq, |
420 | u64 truncate_size, | 421 | u64 truncate_size, |
421 | struct timespec *mtime, | 422 | struct timespec *mtime, |
422 | bool use_mempool, int num_reply) | 423 | bool use_mempool, int num_reply, |
424 | int page_align) | ||
423 | { | 425 | { |
424 | struct ceph_osd_req_op ops[3]; | 426 | struct ceph_osd_req_op ops[3]; |
425 | struct ceph_osd_request *req; | 427 | struct ceph_osd_request *req; |
@@ -447,6 +449,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
447 | calc_layout(osdc, vino, layout, off, plen, req, ops); | 449 | calc_layout(osdc, vino, layout, off, plen, req, ops); |
448 | req->r_file_layout = *layout; /* keep a copy */ | 450 | req->r_file_layout = *layout; /* keep a copy */ |
449 | 451 | ||
452 | /* in case it differs from natural alignment that calc_layout | ||
453 | filled in for us */ | ||
454 | req->r_page_alignment = page_align; | ||
455 | |||
450 | ceph_osdc_build_request(req, off, plen, ops, | 456 | ceph_osdc_build_request(req, off, plen, ops, |
451 | snapc, | 457 | snapc, |
452 | mtime, | 458 | mtime, |
@@ -1489,7 +1495,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, | |||
1489 | struct ceph_vino vino, struct ceph_file_layout *layout, | 1495 | struct ceph_vino vino, struct ceph_file_layout *layout, |
1490 | u64 off, u64 *plen, | 1496 | u64 off, u64 *plen, |
1491 | u32 truncate_seq, u64 truncate_size, | 1497 | u32 truncate_seq, u64 truncate_size, |
1492 | struct page **pages, int num_pages) | 1498 | struct page **pages, int num_pages, int page_align) |
1493 | { | 1499 | { |
1494 | struct ceph_osd_request *req; | 1500 | struct ceph_osd_request *req; |
1495 | int rc = 0; | 1501 | int rc = 0; |
@@ -1499,15 +1505,15 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, | |||
1499 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, | 1505 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, |
1500 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | 1506 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, |
1501 | NULL, 0, truncate_seq, truncate_size, NULL, | 1507 | NULL, 0, truncate_seq, truncate_size, NULL, |
1502 | false, 1); | 1508 | false, 1, page_align); |
1503 | if (!req) | 1509 | if (!req) |
1504 | return -ENOMEM; | 1510 | return -ENOMEM; |
1505 | 1511 | ||
1506 | /* it may be a short read due to an object boundary */ | 1512 | /* it may be a short read due to an object boundary */ |
1507 | req->r_pages = pages; | 1513 | req->r_pages = pages; |
1508 | 1514 | ||
1509 | dout("readpages final extent is %llu~%llu (%d pages)\n", | 1515 | dout("readpages final extent is %llu~%llu (%d pages align %d)\n", |
1510 | off, *plen, req->r_num_pages); | 1516 | off, *plen, req->r_num_pages, page_align); |
1511 | 1517 | ||
1512 | rc = ceph_osdc_start_request(osdc, req, false); | 1518 | rc = ceph_osdc_start_request(osdc, req, false); |
1513 | if (!rc) | 1519 | if (!rc) |
@@ -1533,6 +1539,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
1533 | { | 1539 | { |
1534 | struct ceph_osd_request *req; | 1540 | struct ceph_osd_request *req; |
1535 | int rc = 0; | 1541 | int rc = 0; |
1542 | int page_align = off & ~PAGE_MASK; | ||
1536 | 1543 | ||
1537 | BUG_ON(vino.snap != CEPH_NOSNAP); | 1544 | BUG_ON(vino.snap != CEPH_NOSNAP); |
1538 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, | 1545 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, |
@@ -1541,7 +1548,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
1541 | CEPH_OSD_FLAG_WRITE, | 1548 | CEPH_OSD_FLAG_WRITE, |
1542 | snapc, do_sync, | 1549 | snapc, do_sync, |
1543 | truncate_seq, truncate_size, mtime, | 1550 | truncate_seq, truncate_size, mtime, |
1544 | nofail, 1); | 1551 | nofail, 1, page_align); |
1545 | if (!req) | 1552 | if (!req) |
1546 | return -ENOMEM; | 1553 | return -ENOMEM; |
1547 | 1554 | ||
@@ -1638,8 +1645,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, | |||
1638 | m = ceph_msg_get(req->r_reply); | 1645 | m = ceph_msg_get(req->r_reply); |
1639 | 1646 | ||
1640 | if (data_len > 0) { | 1647 | if (data_len > 0) { |
1641 | unsigned data_off = le16_to_cpu(hdr->data_off); | 1648 | int want = calc_pages_for(req->r_page_alignment, data_len); |
1642 | int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); | ||
1643 | 1649 | ||
1644 | if (unlikely(req->r_num_pages < want)) { | 1650 | if (unlikely(req->r_num_pages < want)) { |
1645 | pr_warning("tid %lld reply %d > expected %d pages\n", | 1651 | pr_warning("tid %lld reply %d > expected %d pages\n", |