diff options
| author | Ilya Dryomov <idryomov@gmail.com> | 2018-11-20 09:44:00 -0500 |
|---|---|---|
| committer | Ilya Dryomov <idryomov@gmail.com> | 2018-12-26 09:56:04 -0500 |
| commit | 433b0a12953bc1dfcb52febb186136395a65aad0 (patch) | |
| tree | ae43992cb2721ae51f4b91be78570af8d14b6b95 /net | |
| parent | 3239eb5215ebdef593a79316c9dbbdf8849166ec (diff) | |
libceph: use MSG_SENDPAGE_NOTLAST with ceph_tcp_sendpage()
Prevent do_tcp_sendpages() from calling tcp_push() (at least) once per
page. Instead, arrange for tcp_push() to be called (at least) once per
data payload. This results in more MSS-sized packets and fewer packets
overall (5-10% reduction in my tests with typical OSD request sizes).
See commits 2f5338442425 ("tcp: allow splice() to build full TSO
packets"), 35f9c09fe9c7 ("tcp: tcp_sendpages() should call tcp_push()
once") and ae62ca7b0321 ("tcp: fix MSG_SENDPAGE_NOTLAST logic") for
details.
Here is an example of a packet size histogram for 128K OSD requests
(MSS = 1448, top 5):
Before:
SIZE COUNT
1448 777700
952 127915
1200 39238
1219 9806
21 5675
After:
SIZE COUNT
1448 897280
21 6201
1019 2797
643 2739
376 2479
We could do slightly better by explicitly corking the socket, but it's
not clear it's worth it.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'net')
| -rw-r--r-- | net/ceph/messenger.c | 17 |
1 file changed, 13 insertions, 4 deletions
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 21a743a3bd29..649faa626b35 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c | |||
| @@ -560,12 +560,15 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, | |||
| 560 | return r; | 560 | return r; |
| 561 | } | 561 | } |
| 562 | 562 | ||
| 563 | /* | ||
| 564 | * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST | ||
| 565 | */ | ||
| 563 | static int ceph_tcp_sendpage(struct socket *sock, struct page *page, | 566 | static int ceph_tcp_sendpage(struct socket *sock, struct page *page, |
| 564 | int offset, size_t size, bool more) | 567 | int offset, size_t size, int more) |
| 565 | { | 568 | { |
| 566 | ssize_t (*sendpage)(struct socket *sock, struct page *page, | 569 | ssize_t (*sendpage)(struct socket *sock, struct page *page, |
| 567 | int offset, size_t size, int flags); | 570 | int offset, size_t size, int flags); |
| 568 | int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : 0); | 571 | int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; |
| 569 | int ret; | 572 | int ret; |
| 570 | 573 | ||
| 571 | /* | 574 | /* |
| @@ -1552,6 +1555,7 @@ static int write_partial_message_data(struct ceph_connection *con) | |||
| 1552 | struct ceph_msg *msg = con->out_msg; | 1555 | struct ceph_msg *msg = con->out_msg; |
| 1553 | struct ceph_msg_data_cursor *cursor = &msg->cursor; | 1556 | struct ceph_msg_data_cursor *cursor = &msg->cursor; |
| 1554 | bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); | 1557 | bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); |
| 1558 | int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; | ||
| 1555 | u32 crc; | 1559 | u32 crc; |
| 1556 | 1560 | ||
| 1557 | dout("%s %p msg %p\n", __func__, con, msg); | 1561 | dout("%s %p msg %p\n", __func__, con, msg); |
| @@ -1580,8 +1584,10 @@ static int write_partial_message_data(struct ceph_connection *con) | |||
| 1580 | } | 1584 | } |
| 1581 | 1585 | ||
| 1582 | page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); | 1586 | page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); |
| 1587 | if (length == cursor->total_resid) | ||
| 1588 | more = MSG_MORE; | ||
| 1583 | ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, | 1589 | ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, |
| 1584 | true); | 1590 | more); |
| 1585 | if (ret <= 0) { | 1591 | if (ret <= 0) { |
| 1586 | if (do_datacrc) | 1592 | if (do_datacrc) |
| 1587 | msg->footer.data_crc = cpu_to_le32(crc); | 1593 | msg->footer.data_crc = cpu_to_le32(crc); |
| @@ -1611,13 +1617,16 @@ static int write_partial_message_data(struct ceph_connection *con) | |||
| 1611 | */ | 1617 | */ |
| 1612 | static int write_partial_skip(struct ceph_connection *con) | 1618 | static int write_partial_skip(struct ceph_connection *con) |
| 1613 | { | 1619 | { |
| 1620 | int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; | ||
| 1614 | int ret; | 1621 | int ret; |
| 1615 | 1622 | ||
| 1616 | dout("%s %p %d left\n", __func__, con, con->out_skip); | 1623 | dout("%s %p %d left\n", __func__, con, con->out_skip); |
| 1617 | while (con->out_skip > 0) { | 1624 | while (con->out_skip > 0) { |
| 1618 | size_t size = min(con->out_skip, (int) PAGE_SIZE); | 1625 | size_t size = min(con->out_skip, (int) PAGE_SIZE); |
| 1619 | 1626 | ||
| 1620 | ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); | 1627 | if (size == con->out_skip) |
| 1628 | more = MSG_MORE; | ||
| 1629 | ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); | ||
| 1621 | if (ret <= 0) | 1630 | if (ret <= 0) |
| 1622 | goto out; | 1631 | goto out; |
| 1623 | con->out_skip -= ret; | 1632 | con->out_skip -= ret; |
