summary | refs | log | tree | commit | diff | stats
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r-- net/ipv4/tcp.c 204
1 files changed, 199 insertions, 5 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dec47e6789e7..2741953adaba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1691,6 +1691,139 @@ int tcp_peek_len(struct socket *sock)
1691} 1691}
1692EXPORT_SYMBOL(tcp_peek_len); 1692EXPORT_SYMBOL(tcp_peek_len);
1693 1693
1694/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
1695int tcp_set_rcvlowat(struct sock *sk, int val)
1696{
1697 sk->sk_rcvlowat = val ? : 1;
1698
1699 /* Check if we need to signal EPOLLIN right now */
1700 tcp_data_ready(sk);
1701
1702 if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1703 return 0;
1704
1705 /* val comes from user space and might be close to INT_MAX */
1706 val <<= 1;
1707 if (val < 0)
1708 val = INT_MAX;
1709
1710 val = min(val, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
1711 if (val > sk->sk_rcvbuf) {
1712 sk->sk_rcvbuf = val;
1713 tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
1714 }
1715 return 0;
1716}
1717EXPORT_SYMBOL(tcp_set_rcvlowat);
1718
1719#ifdef CONFIG_MMU
/* Empty vm_operations table. Only its address matters in the code visible
 * here: tcp_mmap() installs it in vma->vm_ops, and tcp_zerocopy_receive()
 * compares vma->vm_ops against it to recognise a mapping set up by
 * tcp_mmap().
 */
static const struct vm_operations_struct tcp_vm_ops = {
};
1722
1723int tcp_mmap(struct file *file, struct socket *sock,
1724 struct vm_area_struct *vma)
1725{
1726 if (vma->vm_flags & (VM_WRITE | VM_EXEC))
1727 return -EPERM;
1728 vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
1729
1730 /* Instruct vm_insert_page() to not down_read(mmap_sem) */
1731 vma->vm_flags |= VM_MIXEDMAP;
1732
1733 vma->vm_ops = &tcp_vm_ops;
1734 return 0;
1735}
1736EXPORT_SYMBOL(tcp_mmap);
1737
1738static int tcp_zerocopy_receive(struct sock *sk,
1739 struct tcp_zerocopy_receive *zc)
1740{
1741 unsigned long address = (unsigned long)zc->address;
1742 const skb_frag_t *frags = NULL;
1743 u32 length = 0, seq, offset;
1744 struct vm_area_struct *vma;
1745 struct sk_buff *skb = NULL;
1746 struct tcp_sock *tp;
1747 int ret;
1748
1749 if (address & (PAGE_SIZE - 1) || address != zc->address)
1750 return -EINVAL;
1751
1752 if (sk->sk_state == TCP_LISTEN)
1753 return -ENOTCONN;
1754
1755 sock_rps_record_flow(sk);
1756
1757 down_read(&current->mm->mmap_sem);
1758
1759 ret = -EINVAL;
1760 vma = find_vma(current->mm, address);
1761 if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
1762 goto out;
1763 zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
1764
1765 tp = tcp_sk(sk);
1766 seq = tp->copied_seq;
1767 zc->length = min_t(u32, zc->length, tcp_inq(sk));
1768 zc->length &= ~(PAGE_SIZE - 1);
1769
1770 zap_page_range(vma, address, zc->length);
1771
1772 zc->recv_skip_hint = 0;
1773 ret = 0;
1774 while (length + PAGE_SIZE <= zc->length) {
1775 if (zc->recv_skip_hint < PAGE_SIZE) {
1776 if (skb) {
1777 skb = skb->next;
1778 offset = seq - TCP_SKB_CB(skb)->seq;
1779 } else {
1780 skb = tcp_recv_skb(sk, seq, &offset);
1781 }
1782
1783 zc->recv_skip_hint = skb->len - offset;
1784 offset -= skb_headlen(skb);
1785 if ((int)offset < 0 || skb_has_frag_list(skb))
1786 break;
1787 frags = skb_shinfo(skb)->frags;
1788 while (offset) {
1789 if (frags->size > offset)
1790 goto out;
1791 offset -= frags->size;
1792 frags++;
1793 }
1794 }
1795 if (frags->size != PAGE_SIZE || frags->page_offset)
1796 break;
1797 ret = vm_insert_page(vma, address + length,
1798 skb_frag_page(frags));
1799 if (ret)
1800 break;
1801 length += PAGE_SIZE;
1802 seq += PAGE_SIZE;
1803 zc->recv_skip_hint -= PAGE_SIZE;
1804 frags++;
1805 }
1806out:
1807 up_read(&current->mm->mmap_sem);
1808 if (length) {
1809 tp->copied_seq = seq;
1810 tcp_rcv_space_adjust(sk);
1811
1812 /* Clean up data we have read: This will do ACK frames. */
1813 tcp_recv_skb(sk, seq, &offset);
1814 tcp_cleanup_rbuf(sk, length);
1815 ret = 0;
1816 if (length == zc->length)
1817 zc->recv_skip_hint = 0;
1818 } else {
1819 if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
1820 ret = -EIO;
1821 }
1822 zc->length = length;
1823 return ret;
1824}
1825#endif
1826
1694static void tcp_update_recv_tstamps(struct sk_buff *skb, 1827static void tcp_update_recv_tstamps(struct sk_buff *skb,
1695 struct scm_timestamping *tss) 1828 struct scm_timestamping *tss)
1696{ 1829{
@@ -1746,6 +1879,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
1746 } 1879 }
1747} 1880}
1748 1881
1882static int tcp_inq_hint(struct sock *sk)
1883{
1884 const struct tcp_sock *tp = tcp_sk(sk);
1885 u32 copied_seq = READ_ONCE(tp->copied_seq);
1886 u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
1887 int inq;
1888
1889 inq = rcv_nxt - copied_seq;
1890 if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) {
1891 lock_sock(sk);
1892 inq = tp->rcv_nxt - tp->copied_seq;
1893 release_sock(sk);
1894 }
1895 return inq;
1896}
1897
1749/* 1898/*
1750 * This routine copies from a sock struct into the user buffer. 1899 * This routine copies from a sock struct into the user buffer.
1751 * 1900 *
@@ -1762,13 +1911,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1762 u32 peek_seq; 1911 u32 peek_seq;
1763 u32 *seq; 1912 u32 *seq;
1764 unsigned long used; 1913 unsigned long used;
1765 int err; 1914 int err, inq;
1766 int target; /* Read at least this many bytes */ 1915 int target; /* Read at least this many bytes */
1767 long timeo; 1916 long timeo;
1768 struct sk_buff *skb, *last; 1917 struct sk_buff *skb, *last;
1769 u32 urg_hole = 0; 1918 u32 urg_hole = 0;
1770 struct scm_timestamping tss; 1919 struct scm_timestamping tss;
1771 bool has_tss = false; 1920 bool has_tss = false;
1921 bool has_cmsg;
1772 1922
1773 if (unlikely(flags & MSG_ERRQUEUE)) 1923 if (unlikely(flags & MSG_ERRQUEUE))
1774 return inet_recv_error(sk, msg, len, addr_len); 1924 return inet_recv_error(sk, msg, len, addr_len);
@@ -1783,6 +1933,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
1783 if (sk->sk_state == TCP_LISTEN) 1933 if (sk->sk_state == TCP_LISTEN)
1784 goto out; 1934 goto out;
1785 1935
1936 has_cmsg = tp->recvmsg_inq;
1786 timeo = sock_rcvtimeo(sk, nonblock); 1937 timeo = sock_rcvtimeo(sk, nonblock);
1787 1938
1788 /* Urgent data needs to be handled specially. */ 1939 /* Urgent data needs to be handled specially. */
@@ -1969,6 +2120,7 @@ skip_copy:
1969 if (TCP_SKB_CB(skb)->has_rxtstamp) { 2120 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1970 tcp_update_recv_tstamps(skb, &tss); 2121 tcp_update_recv_tstamps(skb, &tss);
1971 has_tss = true; 2122 has_tss = true;
2123 has_cmsg = true;
1972 } 2124 }
1973 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 2125 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1974 goto found_fin_ok; 2126 goto found_fin_ok;
@@ -1988,13 +2140,20 @@ skip_copy:
1988 * on connected socket. I was just happy when found this 8) --ANK 2140 * on connected socket. I was just happy when found this 8) --ANK
1989 */ 2141 */
1990 2142
1991 if (has_tss)
1992 tcp_recv_timestamp(msg, sk, &tss);
1993
1994 /* Clean up data we have read: This will do ACK frames. */ 2143 /* Clean up data we have read: This will do ACK frames. */
1995 tcp_cleanup_rbuf(sk, copied); 2144 tcp_cleanup_rbuf(sk, copied);
1996 2145
1997 release_sock(sk); 2146 release_sock(sk);
2147
2148 if (has_cmsg) {
2149 if (has_tss)
2150 tcp_recv_timestamp(msg, sk, &tss);
2151 if (tp->recvmsg_inq) {
2152 inq = tcp_inq_hint(sk);
2153 put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
2154 }
2155 }
2156
1998 return copied; 2157 return copied;
1999 2158
2000out: 2159out:
@@ -2411,6 +2570,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2411 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 2570 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
2412 tp->snd_cwnd_cnt = 0; 2571 tp->snd_cwnd_cnt = 0;
2413 tp->window_clamp = 0; 2572 tp->window_clamp = 0;
2573 tp->delivered_ce = 0;
2414 tcp_set_ca_state(sk, TCP_CA_Open); 2574 tcp_set_ca_state(sk, TCP_CA_Open);
2415 tp->is_sack_reneg = 0; 2575 tp->is_sack_reneg = 0;
2416 tcp_clear_retrans(tp); 2576 tcp_clear_retrans(tp);
@@ -2424,6 +2584,7 @@ int tcp_disconnect(struct sock *sk, int flags)
2424 dst_release(sk->sk_rx_dst); 2584 dst_release(sk->sk_rx_dst);
2425 sk->sk_rx_dst = NULL; 2585 sk->sk_rx_dst = NULL;
2426 tcp_saved_syn_free(tp); 2586 tcp_saved_syn_free(tp);
2587 tp->compressed_ack = 0;
2427 2588
2428 /* Clean up fastopen related fields */ 2589 /* Clean up fastopen related fields */
2429 tcp_free_fastopen_req(tp); 2590 tcp_free_fastopen_req(tp);
@@ -2862,6 +3023,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2862 tp->notsent_lowat = val; 3023 tp->notsent_lowat = val;
2863 sk->sk_write_space(sk); 3024 sk->sk_write_space(sk);
2864 break; 3025 break;
3026 case TCP_INQ:
3027 if (val > 1 || val < 0)
3028 err = -EINVAL;
3029 else
3030 tp->recvmsg_inq = val;
3031 break;
2865 default: 3032 default:
2866 err = -ENOPROTOOPT; 3033 err = -ENOPROTOOPT;
2867 break; 3034 break;
@@ -3020,6 +3187,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
3020 rate64 = tcp_compute_delivery_rate(tp); 3187 rate64 = tcp_compute_delivery_rate(tp);
3021 if (rate64) 3188 if (rate64)
3022 info->tcpi_delivery_rate = rate64; 3189 info->tcpi_delivery_rate = rate64;
3190 info->tcpi_delivered = tp->delivered;
3191 info->tcpi_delivered_ce = tp->delivered_ce;
3023 unlock_sock_fast(sk, slow); 3192 unlock_sock_fast(sk, slow);
3024} 3193}
3025EXPORT_SYMBOL_GPL(tcp_get_info); 3194EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3033,7 +3202,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3033 u32 rate; 3202 u32 rate;
3034 3203
3035 stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + 3204 stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
3036 5 * nla_total_size(sizeof(u32)) + 3205 7 * nla_total_size(sizeof(u32)) +
3037 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC); 3206 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
3038 if (!stats) 3207 if (!stats)
3039 return NULL; 3208 return NULL;
@@ -3064,9 +3233,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
3064 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); 3233 nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
3065 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); 3234 nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
3066 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); 3235 nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
3236 nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
3237 nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce);
3067 3238
3068 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); 3239 nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
3069 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); 3240 nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
3241
3070 return stats; 3242 return stats;
3071} 3243}
3072 3244
@@ -3282,6 +3454,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3282 case TCP_NOTSENT_LOWAT: 3454 case TCP_NOTSENT_LOWAT:
3283 val = tp->notsent_lowat; 3455 val = tp->notsent_lowat;
3284 break; 3456 break;
3457 case TCP_INQ:
3458 val = tp->recvmsg_inq;
3459 break;
3285 case TCP_SAVE_SYN: 3460 case TCP_SAVE_SYN:
3286 val = tp->save_syn; 3461 val = tp->save_syn;
3287 break; 3462 break;
@@ -3318,6 +3493,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3318 } 3493 }
3319 return 0; 3494 return 0;
3320 } 3495 }
3496#ifdef CONFIG_MMU
3497 case TCP_ZEROCOPY_RECEIVE: {
3498 struct tcp_zerocopy_receive zc;
3499 int err;
3500
3501 if (get_user(len, optlen))
3502 return -EFAULT;
3503 if (len != sizeof(zc))
3504 return -EINVAL;
3505 if (copy_from_user(&zc, optval, len))
3506 return -EFAULT;
3507 lock_sock(sk);
3508 err = tcp_zerocopy_receive(sk, &zc);
3509 release_sock(sk);
3510 if (!err && copy_to_user(optval, &zc, len))
3511 err = -EFAULT;
3512 return err;
3513 }
3514#endif
3321 default: 3515 default:
3322 return -ENOPROTOOPT; 3516 return -ENOPROTOOPT;
3323 } 3517 }