author		Eric Dumazet <edumazet@google.com>	2018-04-16 13:33:38 -0400
committer	David S. Miller <davem@davemloft.net>	2018-04-16 18:26:37 -0400
commit		93ab6cc69162775201587cc9da00d5016dc890e2
tree		dad02afad888037a78dc48b9aeb66e0954897dd8	/net/ipv4/tcp.c
parent		03f45c883c6f391ed4fff8292415b35bd1107519
tcp: implement mmap() for zero copy receive
On some networks, with well-chosen MSS/MTU and suitable architectures, the TCP
payload can be made to fit exactly into 4KB pages.
Implement the mmap() system call so that applications can avoid copying data,
without resorting to complex splice() games.
Note that a successful mmap(X bytes) on a TCP socket consumes X bytes, as if
recvmsg() had been called (tp->copied_seq advances by X).
Only PROT_READ mappings are accepted, as skb page frags
are fundamentally shared and read-only.
If tcp_mmap() finds data that does not fill a full page, or a stretch of
urgent data, -EINVAL is returned and no bytes are consumed.
The application must fall back to recvmsg() to read the problematic sequence.
mmap() won't block, regardless of whether the socket is in blocking or
non-blocking mode. If not enough bytes are in the receive queue,
mmap() returns -EAGAIN, or -EIO if the socket is in a state
where no more bytes can be added to the receive queue.
An application might use SO_RCVLOWAT, poll() and/or ioctl(FIONREAD)
to use mmap() efficiently, as sketched below.
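A minimal, hypothetical receiver-side sketch (not part of this patch; the
tcp_mmap test program added in the follow-up patch is the reference). The
chunk size, helper name and error handling are illustrative assumptions:
wait for a full chunk with SO_RCVLOWAT and poll(), confirm with FIONREAD,
try the zero-copy mmap(), and fall back to recv() when the kernel declines
the mapping.

	/* Hypothetical usage sketch, not taken from this patch. */
	#include <poll.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/socket.h>
	#include <unistd.h>

	#define CHUNK (16 * 4096)	/* must be a multiple of the page size */

	static ssize_t read_chunk(int fd, char *fallback_buf)
	{
		struct pollfd pfd = { .fd = fd, .events = POLLIN };
		int lowat = CHUNK;
		int avail = 0;
		void *p;

		/* Only wake up once a full chunk sits in the receive queue. */
		setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
		poll(&pfd, 1, -1);
		ioctl(fd, FIONREAD, &avail);
		if (avail < CHUNK)
			return recv(fd, fallback_buf, CHUNK, 0);

		/* Zero-copy path: map CHUNK bytes of payload, consuming them. */
		p = mmap(NULL, CHUNK, PROT_READ, MAP_SHARED, fd, 0);
		if (p != MAP_FAILED) {
			/* ... process the mapped payload ... */
			munmap(p, CHUNK);
			return CHUNK;
		}
		/* EINVAL (misaligned or urgent data) or EAGAIN: copy instead. */
		return recv(fd, fallback_buf, CHUNK, 0);
	}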
On the sender side, MSG_EOR might help to cleanly separate unaligned
headers from 4K-aligned chunks if necessary (see the sketch below).
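For illustration only (hdr/payload buffers and lengths are hypothetical), a
sender could mark the end of an unaligned header with MSG_EOR so it is not
coalesced with the page-aligned payload that follows:

	send(fd, hdr, hdr_len, MSG_EOR);	/* unaligned header, ends the skb */
	send(fd, payload, payload_len, 0);	/* payload_len a multiple of 4096 */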
Tested:
mlx4 (CX-3) 40Gbit NIC, with the tcp_mmap program provided in the following patch.
MTU set to 4168 (4096 TCP payload, 40 bytes IPv6 header, 32 bytes TCP header)
Without mmap() (tcp_mmap -s)
received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit,
cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit,
cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit,
cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit,
cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches
With mmap() on receiver (tcp_mmap -s -z)
received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit,
cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit,
cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit,
cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit,
cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--	net/ipv4/tcp.c	113
1 file changed, 113 insertions, 0 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c768d306b657..438fbca96cd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
+/* When user wants to mmap X pages, we first need to perform the mapping
+ * before freeing any skbs in receive queue, otherwise user would be unable
+ * to fallback to standard recvmsg(). This happens if some data in the
+ * requested block is not exactly fitting in a page.
+ *
+ * We only support order-0 pages for the moment.
+ * mmap() on TCP is very strict, there is no point
+ * trying to accommodate with pathological layouts.
+ */
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	unsigned int nr_pages = size >> PAGE_SHIFT;
+	struct page **pages_array = NULL;
+	u32 seq, len, offset, nr = 0;
+	struct sock *sk = sock->sk;
+	const skb_frag_t *frags;
+	struct tcp_sock *tp;
+	struct sk_buff *skb;
+	int ret;
+
+	if (vma->vm_pgoff || !nr_pages)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	/* TODO: Maybe the following is not needed if pages are COW */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	lock_sock(sk);
+
+	ret = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	sock_rps_record_flow(sk);
+
+	if (tcp_inq(sk) < size) {
+		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+		goto out;
+	}
+	tp = tcp_sk(sk);
+	seq = tp->copied_seq;
+	/* Abort if urgent data is in the area */
+	if (unlikely(tp->urg_data)) {
+		u32 urg_offset = tp->urg_seq - seq;
+
+		ret = -EINVAL;
+		if (urg_offset < size)
+			goto out;
+	}
+	ret = -ENOMEM;
+	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
+				     GFP_KERNEL);
+	if (!pages_array)
+		goto out;
+	skb = tcp_recv_skb(sk, seq, &offset);
+	ret = -EINVAL;
+skb_start:
+	/* We do not support anything not in page frags */
+	offset -= skb_headlen(skb);
+	if ((int)offset < 0)
+		goto out;
+	if (skb_has_frag_list(skb))
+		goto out;
+	len = skb->data_len - offset;
+	frags = skb_shinfo(skb)->frags;
+	while (offset) {
+		if (frags->size > offset)
+			goto out;
+		offset -= frags->size;
+		frags++;
+	}
+	while (nr < nr_pages) {
+		if (len) {
+			if (len < PAGE_SIZE)
+				goto out;
+			if (frags->size != PAGE_SIZE || frags->page_offset)
+				goto out;
+			pages_array[nr++] = skb_frag_page(frags);
+			frags++;
+			len -= PAGE_SIZE;
+			seq += PAGE_SIZE;
+			continue;
+		}
+		skb = skb->next;
+		offset = seq - TCP_SKB_CB(skb)->seq;
+		goto skb_start;
+	}
+	/* OK, we have a full set of pages ready to be inserted into vma */
+	for (nr = 0; nr < nr_pages; nr++) {
+		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
+				     pages_array[nr]);
+		if (ret)
+			goto out;
+	}
+	/* operation is complete, we can 'consume' all skbs */
+	tp->copied_seq = seq;
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_recv_skb(sk, seq, &offset);
+	tcp_cleanup_rbuf(sk, size);
+
+	ret = 0;
+out:
+	release_sock(sk);
+	kvfree(pages_array);
+	return ret;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
 {