diff options
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- | net/ipv4/tcp.c | 204 |
1 files changed, 199 insertions, 5 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dec47e6789e7..2741953adaba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -1691,6 +1691,139 @@ int tcp_peek_len(struct socket *sock) | |||
1691 | } | 1691 | } |
1692 | EXPORT_SYMBOL(tcp_peek_len); | 1692 | EXPORT_SYMBOL(tcp_peek_len); |
1693 | 1693 | ||
1694 | /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ | ||
1695 | int tcp_set_rcvlowat(struct sock *sk, int val) | ||
1696 | { | ||
1697 | sk->sk_rcvlowat = val ? : 1; | ||
1698 | |||
1699 | /* Check if we need to signal EPOLLIN right now */ | ||
1700 | tcp_data_ready(sk); | ||
1701 | |||
1702 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) | ||
1703 | return 0; | ||
1704 | |||
1705 | /* val comes from user space and might be close to INT_MAX */ | ||
1706 | val <<= 1; | ||
1707 | if (val < 0) | ||
1708 | val = INT_MAX; | ||
1709 | |||
1710 | val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); | ||
1711 | if (val > sk->sk_rcvbuf) { | ||
1712 | sk->sk_rcvbuf = val; | ||
1713 | tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); | ||
1714 | } | ||
1715 | return 0; | ||
1716 | } | ||
1717 | EXPORT_SYMBOL(tcp_set_rcvlowat); | ||
1718 | |||
1719 | #ifdef CONFIG_MMU | ||
1720 | static const struct vm_operations_struct tcp_vm_ops = { | ||
1721 | }; | ||
1722 | |||
1723 | int tcp_mmap(struct file *file, struct socket *sock, | ||
1724 | struct vm_area_struct *vma) | ||
1725 | { | ||
1726 | if (vma->vm_flags & (VM_WRITE | VM_EXEC)) | ||
1727 | return -EPERM; | ||
1728 | vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); | ||
1729 | |||
1730 | /* Instruct vm_insert_page() to not down_read(mmap_sem) */ | ||
1731 | vma->vm_flags |= VM_MIXEDMAP; | ||
1732 | |||
1733 | vma->vm_ops = &tcp_vm_ops; | ||
1734 | return 0; | ||
1735 | } | ||
1736 | EXPORT_SYMBOL(tcp_mmap); | ||
1737 | |||
1738 | static int tcp_zerocopy_receive(struct sock *sk, | ||
1739 | struct tcp_zerocopy_receive *zc) | ||
1740 | { | ||
1741 | unsigned long address = (unsigned long)zc->address; | ||
1742 | const skb_frag_t *frags = NULL; | ||
1743 | u32 length = 0, seq, offset; | ||
1744 | struct vm_area_struct *vma; | ||
1745 | struct sk_buff *skb = NULL; | ||
1746 | struct tcp_sock *tp; | ||
1747 | int ret; | ||
1748 | |||
1749 | if (address & (PAGE_SIZE - 1) || address != zc->address) | ||
1750 | return -EINVAL; | ||
1751 | |||
1752 | if (sk->sk_state == TCP_LISTEN) | ||
1753 | return -ENOTCONN; | ||
1754 | |||
1755 | sock_rps_record_flow(sk); | ||
1756 | |||
1757 | down_read(¤t->mm->mmap_sem); | ||
1758 | |||
1759 | ret = -EINVAL; | ||
1760 | vma = find_vma(current->mm, address); | ||
1761 | if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) | ||
1762 | goto out; | ||
1763 | zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); | ||
1764 | |||
1765 | tp = tcp_sk(sk); | ||
1766 | seq = tp->copied_seq; | ||
1767 | zc->length = min_t(u32, zc->length, tcp_inq(sk)); | ||
1768 | zc->length &= ~(PAGE_SIZE - 1); | ||
1769 | |||
1770 | zap_page_range(vma, address, zc->length); | ||
1771 | |||
1772 | zc->recv_skip_hint = 0; | ||
1773 | ret = 0; | ||
1774 | while (length + PAGE_SIZE <= zc->length) { | ||
1775 | if (zc->recv_skip_hint < PAGE_SIZE) { | ||
1776 | if (skb) { | ||
1777 | skb = skb->next; | ||
1778 | offset = seq - TCP_SKB_CB(skb)->seq; | ||
1779 | } else { | ||
1780 | skb = tcp_recv_skb(sk, seq, &offset); | ||
1781 | } | ||
1782 | |||
1783 | zc->recv_skip_hint = skb->len - offset; | ||
1784 | offset -= skb_headlen(skb); | ||
1785 | if ((int)offset < 0 || skb_has_frag_list(skb)) | ||
1786 | break; | ||
1787 | frags = skb_shinfo(skb)->frags; | ||
1788 | while (offset) { | ||
1789 | if (frags->size > offset) | ||
1790 | goto out; | ||
1791 | offset -= frags->size; | ||
1792 | frags++; | ||
1793 | } | ||
1794 | } | ||
1795 | if (frags->size != PAGE_SIZE || frags->page_offset) | ||
1796 | break; | ||
1797 | ret = vm_insert_page(vma, address + length, | ||
1798 | skb_frag_page(frags)); | ||
1799 | if (ret) | ||
1800 | break; | ||
1801 | length += PAGE_SIZE; | ||
1802 | seq += PAGE_SIZE; | ||
1803 | zc->recv_skip_hint -= PAGE_SIZE; | ||
1804 | frags++; | ||
1805 | } | ||
1806 | out: | ||
1807 | up_read(¤t->mm->mmap_sem); | ||
1808 | if (length) { | ||
1809 | tp->copied_seq = seq; | ||
1810 | tcp_rcv_space_adjust(sk); | ||
1811 | |||
1812 | /* Clean up data we have read: This will do ACK frames. */ | ||
1813 | tcp_recv_skb(sk, seq, &offset); | ||
1814 | tcp_cleanup_rbuf(sk, length); | ||
1815 | ret = 0; | ||
1816 | if (length == zc->length) | ||
1817 | zc->recv_skip_hint = 0; | ||
1818 | } else { | ||
1819 | if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) | ||
1820 | ret = -EIO; | ||
1821 | } | ||
1822 | zc->length = length; | ||
1823 | return ret; | ||
1824 | } | ||
1825 | #endif | ||
1826 | |||
1694 | static void tcp_update_recv_tstamps(struct sk_buff *skb, | 1827 | static void tcp_update_recv_tstamps(struct sk_buff *skb, |
1695 | struct scm_timestamping *tss) | 1828 | struct scm_timestamping *tss) |
1696 | { | 1829 | { |
@@ -1746,6 +1879,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, | |||
1746 | } | 1879 | } |
1747 | } | 1880 | } |
1748 | 1881 | ||
1882 | static int tcp_inq_hint(struct sock *sk) | ||
1883 | { | ||
1884 | const struct tcp_sock *tp = tcp_sk(sk); | ||
1885 | u32 copied_seq = READ_ONCE(tp->copied_seq); | ||
1886 | u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); | ||
1887 | int inq; | ||
1888 | |||
1889 | inq = rcv_nxt - copied_seq; | ||
1890 | if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { | ||
1891 | lock_sock(sk); | ||
1892 | inq = tp->rcv_nxt - tp->copied_seq; | ||
1893 | release_sock(sk); | ||
1894 | } | ||
1895 | return inq; | ||
1896 | } | ||
1897 | |||
1749 | /* | 1898 | /* |
1750 | * This routine copies from a sock struct into the user buffer. | 1899 | * This routine copies from a sock struct into the user buffer. |
1751 | * | 1900 | * |
@@ -1762,13 +1911,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, | |||
1762 | u32 peek_seq; | 1911 | u32 peek_seq; |
1763 | u32 *seq; | 1912 | u32 *seq; |
1764 | unsigned long used; | 1913 | unsigned long used; |
1765 | int err; | 1914 | int err, inq; |
1766 | int target; /* Read at least this many bytes */ | 1915 | int target; /* Read at least this many bytes */ |
1767 | long timeo; | 1916 | long timeo; |
1768 | struct sk_buff *skb, *last; | 1917 | struct sk_buff *skb, *last; |
1769 | u32 urg_hole = 0; | 1918 | u32 urg_hole = 0; |
1770 | struct scm_timestamping tss; | 1919 | struct scm_timestamping tss; |
1771 | bool has_tss = false; | 1920 | bool has_tss = false; |
1921 | bool has_cmsg; | ||
1772 | 1922 | ||
1773 | if (unlikely(flags & MSG_ERRQUEUE)) | 1923 | if (unlikely(flags & MSG_ERRQUEUE)) |
1774 | return inet_recv_error(sk, msg, len, addr_len); | 1924 | return inet_recv_error(sk, msg, len, addr_len); |
@@ -1783,6 +1933,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, | |||
1783 | if (sk->sk_state == TCP_LISTEN) | 1933 | if (sk->sk_state == TCP_LISTEN) |
1784 | goto out; | 1934 | goto out; |
1785 | 1935 | ||
1936 | has_cmsg = tp->recvmsg_inq; | ||
1786 | timeo = sock_rcvtimeo(sk, nonblock); | 1937 | timeo = sock_rcvtimeo(sk, nonblock); |
1787 | 1938 | ||
1788 | /* Urgent data needs to be handled specially. */ | 1939 | /* Urgent data needs to be handled specially. */ |
@@ -1969,6 +2120,7 @@ skip_copy: | |||
1969 | if (TCP_SKB_CB(skb)->has_rxtstamp) { | 2120 | if (TCP_SKB_CB(skb)->has_rxtstamp) { |
1970 | tcp_update_recv_tstamps(skb, &tss); | 2121 | tcp_update_recv_tstamps(skb, &tss); |
1971 | has_tss = true; | 2122 | has_tss = true; |
2123 | has_cmsg = true; | ||
1972 | } | 2124 | } |
1973 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) | 2125 | if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
1974 | goto found_fin_ok; | 2126 | goto found_fin_ok; |
@@ -1988,13 +2140,20 @@ skip_copy: | |||
1988 | * on connected socket. I was just happy when found this 8) --ANK | 2140 | * on connected socket. I was just happy when found this 8) --ANK |
1989 | */ | 2141 | */ |
1990 | 2142 | ||
1991 | if (has_tss) | ||
1992 | tcp_recv_timestamp(msg, sk, &tss); | ||
1993 | |||
1994 | /* Clean up data we have read: This will do ACK frames. */ | 2143 | /* Clean up data we have read: This will do ACK frames. */ |
1995 | tcp_cleanup_rbuf(sk, copied); | 2144 | tcp_cleanup_rbuf(sk, copied); |
1996 | 2145 | ||
1997 | release_sock(sk); | 2146 | release_sock(sk); |
2147 | |||
2148 | if (has_cmsg) { | ||
2149 | if (has_tss) | ||
2150 | tcp_recv_timestamp(msg, sk, &tss); | ||
2151 | if (tp->recvmsg_inq) { | ||
2152 | inq = tcp_inq_hint(sk); | ||
2153 | put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq); | ||
2154 | } | ||
2155 | } | ||
2156 | |||
1998 | return copied; | 2157 | return copied; |
1999 | 2158 | ||
2000 | out: | 2159 | out: |
@@ -2411,6 +2570,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
2411 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 2570 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
2412 | tp->snd_cwnd_cnt = 0; | 2571 | tp->snd_cwnd_cnt = 0; |
2413 | tp->window_clamp = 0; | 2572 | tp->window_clamp = 0; |
2573 | tp->delivered_ce = 0; | ||
2414 | tcp_set_ca_state(sk, TCP_CA_Open); | 2574 | tcp_set_ca_state(sk, TCP_CA_Open); |
2415 | tp->is_sack_reneg = 0; | 2575 | tp->is_sack_reneg = 0; |
2416 | tcp_clear_retrans(tp); | 2576 | tcp_clear_retrans(tp); |
@@ -2424,6 +2584,7 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
2424 | dst_release(sk->sk_rx_dst); | 2584 | dst_release(sk->sk_rx_dst); |
2425 | sk->sk_rx_dst = NULL; | 2585 | sk->sk_rx_dst = NULL; |
2426 | tcp_saved_syn_free(tp); | 2586 | tcp_saved_syn_free(tp); |
2587 | tp->compressed_ack = 0; | ||
2427 | 2588 | ||
2428 | /* Clean up fastopen related fields */ | 2589 | /* Clean up fastopen related fields */ |
2429 | tcp_free_fastopen_req(tp); | 2590 | tcp_free_fastopen_req(tp); |
@@ -2862,6 +3023,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
2862 | tp->notsent_lowat = val; | 3023 | tp->notsent_lowat = val; |
2863 | sk->sk_write_space(sk); | 3024 | sk->sk_write_space(sk); |
2864 | break; | 3025 | break; |
3026 | case TCP_INQ: | ||
3027 | if (val > 1 || val < 0) | ||
3028 | err = -EINVAL; | ||
3029 | else | ||
3030 | tp->recvmsg_inq = val; | ||
3031 | break; | ||
2865 | default: | 3032 | default: |
2866 | err = -ENOPROTOOPT; | 3033 | err = -ENOPROTOOPT; |
2867 | break; | 3034 | break; |
@@ -3020,6 +3187,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) | |||
3020 | rate64 = tcp_compute_delivery_rate(tp); | 3187 | rate64 = tcp_compute_delivery_rate(tp); |
3021 | if (rate64) | 3188 | if (rate64) |
3022 | info->tcpi_delivery_rate = rate64; | 3189 | info->tcpi_delivery_rate = rate64; |
3190 | info->tcpi_delivered = tp->delivered; | ||
3191 | info->tcpi_delivered_ce = tp->delivered_ce; | ||
3023 | unlock_sock_fast(sk, slow); | 3192 | unlock_sock_fast(sk, slow); |
3024 | } | 3193 | } |
3025 | EXPORT_SYMBOL_GPL(tcp_get_info); | 3194 | EXPORT_SYMBOL_GPL(tcp_get_info); |
@@ -3033,7 +3202,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) | |||
3033 | u32 rate; | 3202 | u32 rate; |
3034 | 3203 | ||
3035 | stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + | 3204 | stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + |
3036 | 5 * nla_total_size(sizeof(u32)) + | 3205 | 7 * nla_total_size(sizeof(u32)) + |
3037 | 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); | 3206 | 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); |
3038 | if (!stats) | 3207 | if (!stats) |
3039 | return NULL; | 3208 | return NULL; |
@@ -3064,9 +3233,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) | |||
3064 | nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); | 3233 | nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); |
3065 | nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); | 3234 | nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); |
3066 | nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); | 3235 | nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); |
3236 | nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); | ||
3237 | nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); | ||
3067 | 3238 | ||
3068 | nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); | 3239 | nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); |
3069 | nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); | 3240 | nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); |
3241 | |||
3070 | return stats; | 3242 | return stats; |
3071 | } | 3243 | } |
3072 | 3244 | ||
@@ -3282,6 +3454,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
3282 | case TCP_NOTSENT_LOWAT: | 3454 | case TCP_NOTSENT_LOWAT: |
3283 | val = tp->notsent_lowat; | 3455 | val = tp->notsent_lowat; |
3284 | break; | 3456 | break; |
3457 | case TCP_INQ: | ||
3458 | val = tp->recvmsg_inq; | ||
3459 | break; | ||
3285 | case TCP_SAVE_SYN: | 3460 | case TCP_SAVE_SYN: |
3286 | val = tp->save_syn; | 3461 | val = tp->save_syn; |
3287 | break; | 3462 | break; |
@@ -3318,6 +3493,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |||
3318 | } | 3493 | } |
3319 | return 0; | 3494 | return 0; |
3320 | } | 3495 | } |
3496 | #ifdef CONFIG_MMU | ||
3497 | case TCP_ZEROCOPY_RECEIVE: { | ||
3498 | struct tcp_zerocopy_receive zc; | ||
3499 | int err; | ||
3500 | |||
3501 | if (get_user(len, optlen)) | ||
3502 | return -EFAULT; | ||
3503 | if (len != sizeof(zc)) | ||
3504 | return -EINVAL; | ||
3505 | if (copy_from_user(&zc, optval, len)) | ||
3506 | return -EFAULT; | ||
3507 | lock_sock(sk); | ||
3508 | err = tcp_zerocopy_receive(sk, &zc); | ||
3509 | release_sock(sk); | ||
3510 | if (!err && copy_to_user(optval, &zc, len)) | ||
3511 | err = -EFAULT; | ||
3512 | return err; | ||
3513 | } | ||
3514 | #endif | ||
3321 | default: | 3515 | default: |
3322 | return -ENOPROTOOPT; | 3516 | return -ENOPROTOOPT; |
3323 | } | 3517 | } |