author     Eric Dumazet <edumazet@google.com>      2018-04-16 13:33:38 -0400
committer  David S. Miller <davem@davemloft.net>   2018-04-16 18:26:37 -0400
commit     93ab6cc69162775201587cc9da00d5016dc890e2 (patch)
tree       dad02afad888037a78dc48b9aeb66e0954897dd8 /net/ipv4/tcp.c
parent     03f45c883c6f391ed4fff8292415b35bd1107519 (diff)
tcp: implement mmap() for zero copy receive
Some networks can make sure the TCP payload fits exactly into 4KB pages, with
well chosen MSS/MTU and architectures.

Implement the mmap() system call so that applications can avoid copying data
without complex splice() games.

Note that a successful mmap(X bytes) on a TCP socket consumes those bytes, as
if recvmsg() had been done. (tp->copied += X)

Only PROT_READ mappings are accepted, as skb page frags are fundamentally
shared and read only.

If tcp_mmap() finds data that is not a full page, or a patch of urgent data,
-EINVAL is returned and no bytes are consumed. The application must fall back
to recvmsg() to read the problematic sequence.

mmap() won't block, regardless of the socket being in blocking or non-blocking
mode. If not enough bytes are in the receive queue, mmap() returns -EAGAIN, or
-EIO if the socket is in a state where no more bytes can be added to the
receive queue.

An application might use SO_RCVLOWAT, poll() and/or ioctl(FIONREAD) to use
mmap() efficiently.

On the sender side, MSG_EOR might help to clearly separate unaligned headers
and 4K-aligned chunks if necessary.

Tested:
mlx4 (cx-3) 40Gbit NIC, with the tcp_mmap program provided in a following
patch. MTU set to 4168 (4096 bytes TCP payload, 40 bytes IPv6 header,
32 bytes TCP header).

Without mmap() (tcp_mmap -s)
received 32768 MB (0 % mmap'ed) in 8.13342 s, 33.7961 Gbit,
  cpu usage user:0.034 sys:3.778, 116.333 usec per MB, 63062 c-switches
received 32768 MB (0 % mmap'ed) in 8.14501 s, 33.748 Gbit,
  cpu usage user:0.029 sys:3.997, 122.864 usec per MB, 61903 c-switches
received 32768 MB (0 % mmap'ed) in 8.11723 s, 33.8635 Gbit,
  cpu usage user:0.048 sys:3.964, 122.437 usec per MB, 62983 c-switches
received 32768 MB (0 % mmap'ed) in 8.39189 s, 32.7552 Gbit,
  cpu usage user:0.038 sys:4.181, 128.754 usec per MB, 55834 c-switches

With mmap() on receiver (tcp_mmap -s -z)
received 32768 MB (100 % mmap'ed) in 8.03083 s, 34.2278 Gbit,
  cpu usage user:0.024 sys:1.466, 45.4712 usec per MB, 65479 c-switches
received 32768 MB (100 % mmap'ed) in 7.98805 s, 34.4111 Gbit,
  cpu usage user:0.026 sys:1.401, 43.5486 usec per MB, 65447 c-switches
received 32768 MB (100 % mmap'ed) in 7.98377 s, 34.4296 Gbit,
  cpu usage user:0.028 sys:1.452, 45.166 usec per MB, 65496 c-switches
received 32768 MB (99.9969 % mmap'ed) in 8.01838 s, 34.281 Gbit,
  cpu usage user:0.02 sys:1.446, 44.7388 usec per MB, 65505 c-switches

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
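The receiver-side usage pattern described above could look roughly like the
sketch below. It is only an illustration of the intended flow, not code from
this patch: recv_chunk() and CHUNK are hypothetical names and error handling
is omitted. The application waits until a full chunk is queued (SO_RCVLOWAT,
poll(), FIONREAD), attempts the zero-copy mmap(), and falls back to the copy
path when the payload is not page aligned.

/* Hypothetical receiver sketch (not part of this patch). */
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>

#define CHUNK (16 * 4096)	/* assumed chunk size: 16 pages of 4KB */

static ssize_t recv_chunk(int fd, char *copybuf)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	int lowat = CHUNK, avail = 0;
	void *zc;

	/* Do not wake up before a full chunk is queued. */
	setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));
	poll(&pfd, 1, -1);

	ioctl(fd, FIONREAD, &avail);
	if (avail >= CHUNK) {
		/* Zero-copy attempt: PROT_READ only, page-aligned length,
		 * offset 0. A successful mmap() consumes CHUNK bytes, as
		 * if recvmsg() had copied them.
		 */
		zc = mmap(NULL, CHUNK, PROT_READ, MAP_SHARED, fd, 0);
		if (zc != MAP_FAILED) {
			/* ... process the CHUNK bytes in place ... */
			munmap(zc, CHUNK);
			return CHUNK;
		}
		/* EINVAL (unaligned or urgent data): nothing was consumed,
		 * fall back to the copy path below.
		 */
	}
	return recv(fd, copybuf, CHUNK, 0);
}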
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--   net/ipv4/tcp.c   113
1 file changed, 113 insertions(+), 0 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c768d306b657..438fbca96cd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1726,6 +1726,119 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 }
 EXPORT_SYMBOL(tcp_set_rcvlowat);
 
+/* When user wants to mmap X pages, we first need to perform the mapping
+ * before freeing any skbs in receive queue, otherwise user would be unable
+ * to fallback to standard recvmsg(). This happens if some data in the
+ * requested block is not exactly fitting in a page.
+ *
+ * We only support order-0 pages for the moment.
+ * mmap() on TCP is very strict, there is no point
+ * trying to accommodate with pathological layouts.
+ */
+int tcp_mmap(struct file *file, struct socket *sock,
+	     struct vm_area_struct *vma)
+{
+	unsigned long size = vma->vm_end - vma->vm_start;
+	unsigned int nr_pages = size >> PAGE_SHIFT;
+	struct page **pages_array = NULL;
+	u32 seq, len, offset, nr = 0;
+	struct sock *sk = sock->sk;
+	const skb_frag_t *frags;
+	struct tcp_sock *tp;
+	struct sk_buff *skb;
+	int ret;
+
+	if (vma->vm_pgoff || !nr_pages)
+		return -EINVAL;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+	/* TODO: Maybe the following is not needed if pages are COW */
+	vma->vm_flags &= ~VM_MAYWRITE;
+
+	lock_sock(sk);
+
+	ret = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	sock_rps_record_flow(sk);
+
+	if (tcp_inq(sk) < size) {
+		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
+		goto out;
+	}
+	tp = tcp_sk(sk);
+	seq = tp->copied_seq;
+	/* Abort if urgent data is in the area */
+	if (unlikely(tp->urg_data)) {
+		u32 urg_offset = tp->urg_seq - seq;
+
+		ret = -EINVAL;
+		if (urg_offset < size)
+			goto out;
+	}
+	ret = -ENOMEM;
+	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
+				     GFP_KERNEL);
+	if (!pages_array)
+		goto out;
+	skb = tcp_recv_skb(sk, seq, &offset);
+	ret = -EINVAL;
+skb_start:
+	/* We do not support anything not in page frags */
+	offset -= skb_headlen(skb);
+	if ((int)offset < 0)
+		goto out;
+	if (skb_has_frag_list(skb))
+		goto out;
+	len = skb->data_len - offset;
+	frags = skb_shinfo(skb)->frags;
+	while (offset) {
+		if (frags->size > offset)
+			goto out;
+		offset -= frags->size;
+		frags++;
+	}
+	while (nr < nr_pages) {
+		if (len) {
+			if (len < PAGE_SIZE)
+				goto out;
+			if (frags->size != PAGE_SIZE || frags->page_offset)
+				goto out;
+			pages_array[nr++] = skb_frag_page(frags);
+			frags++;
+			len -= PAGE_SIZE;
+			seq += PAGE_SIZE;
+			continue;
+		}
+		skb = skb->next;
+		offset = seq - TCP_SKB_CB(skb)->seq;
+		goto skb_start;
+	}
+	/* OK, we have a full set of pages ready to be inserted into vma */
+	for (nr = 0; nr < nr_pages; nr++) {
+		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
+				     pages_array[nr]);
+		if (ret)
+			goto out;
+	}
+	/* operation is complete, we can 'consume' all skbs */
+	tp->copied_seq = seq;
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	tcp_recv_skb(sk, seq, &offset);
+	tcp_cleanup_rbuf(sk, size);
+
+	ret = 0;
+out:
+	release_sock(sk);
+	kvfree(pages_array);
+	return ret;
+}
+EXPORT_SYMBOL(tcp_mmap);
+
 static void tcp_update_recv_tstamps(struct sk_buff *skb,
 				    struct scm_timestamping *tss)
 {