Diffstat (limited to 'net'):

 net/core/datagram.c                           |  21
 net/core/dev.c                                |  12
 net/core/netpoll.c                            |  18
 net/decnet/af_decnet.c                        |  14
 net/ipv4/icmp.c                               |   6
 net/ipv4/igmp.c                               |  19
 net/ipv4/ip_gre.c                             |  15
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c  |  11
 net/ipv4/sysctl_net_ipv4.c                    |   8
 net/ipv4/tcp.c                                |   3
 net/ipv4/tcp_bic.c                            |  12
 net/ipv4/tcp_cong.c                           |  40
 net/ipv4/tcp_highspeed.c                      |  11
 net/ipv4/tcp_htcp.c                           |  13
 net/ipv4/tcp_hybla.c                          |   6
 net/ipv4/tcp_input.c                          | 288
 net/ipv4/tcp_ipv4.c                           |  28
 net/ipv4/tcp_minisocks.c                      |   7
 net/ipv4/tcp_output.c                         |  61
 net/ipv4/tcp_scalable.c                       |  14
 net/ipv4/tcp_timer.c                          |   4
 net/ipv4/tcp_vegas.c                          |  42
 net/ipv4/udp.c                                |   7
 net/ipv6/icmp.c                               |  21
 net/ipv6/raw.c                                |  42
 net/ipv6/tcp_ipv6.c                           |  20
 net/ipv6/udp.c                                |  25
 net/rxrpc/transport.c                         |  15
 net/sunrpc/socklib.c                          |   5
 net/sunrpc/svcsock.c                          |   9
 30 files changed, 466 insertions(+), 331 deletions(-)
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d219435d086c..1bcfef51ac58 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -350,6 +350,20 @@ fault:
 	return -EFAULT;
 }
 
+unsigned int __skb_checksum_complete(struct sk_buff *skb)
+{
+	unsigned int sum;
+
+	sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+	if (likely(!sum)) {
+		if (unlikely(skb->ip_summed == CHECKSUM_HW))
+			netdev_rx_csum_fault(skb->dev);
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
+
 /**
  *	skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
  *	@skb: skbuff
@@ -363,7 +377,7 @@ fault:
  *	-EFAULT - fault during copy. Beware, in this case iovec
  *		can be modified!
  */
-int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
+int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
 				     int hlen, struct iovec *iov)
 {
 	unsigned int csum;
@@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
 		iov++;
 
 	if (iov->iov_len < chunk) {
-		if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen,
-							   skb->csum)))
+		if (__skb_checksum_complete(skb))
 			goto csum_error;
 		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
 			goto fault;
@@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
 			goto fault;
 		if ((unsigned short)csum_fold(csum))
 			goto csum_error;
+		if (unlikely(skb->ip_summed == CHECKSUM_HW))
+			netdev_rx_csum_fault(skb->dev);
 		iov->iov_len -= chunk;
 		iov->iov_base += chunk;
 	}
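The __skb_checksum_complete() helper added above becomes the single place where the full software checksum is computed, a lying hardware checksum is reported via netdev_rx_csum_fault(), and the skb is marked CHECKSUM_UNNECESSARY on success. A minimal usage sketch (not part of this patch; the function name is illustrative) of how a receive path consumes it, assuming skb->csum already holds any pseudo-header sum the protocol requires:

static int example_rcv_checksum_ok(struct sk_buff *skb)
{
	/* Already verified by hardware (or loopback)? Nothing to do. */
	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
		return 1;

	/* Folds the full checksum; also reports hardware csum failures
	 * and marks the skb CHECKSUM_UNNECESSARY when the sum is zero. */
	return __skb_checksum_complete(skb) == 0;
}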
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d1541595277..0b48e294aafe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1108,6 +1108,18 @@ out:
 	return ret;
 }
 
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+void netdev_rx_csum_fault(struct net_device *dev)
+{
+	if (net_ratelimit()) {
+		printk(KERN_ERR "%s: hw csum failure.\n", dev->name);
+		dump_stack();
+	}
+}
+EXPORT_SYMBOL(netdev_rx_csum_fault);
+#endif
+
 #ifdef CONFIG_HIGHMEM
 /* Actually, we should eliminate this check as soon as we know, that:
  * 1. IOMMU is present and allows to map all the memory.
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 802fe11efad0..49424a42a2c0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb)
 static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
 			unsigned short ulen, u32 saddr, u32 daddr)
 {
-	if (uh->check == 0)
+	unsigned int psum;
+
+	if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
 		return 0;
 
-	if (skb->ip_summed == CHECKSUM_HW)
-		return csum_tcpudp_magic(
-			saddr, daddr, ulen, IPPROTO_UDP, skb->csum);
+	psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+
+	if (skb->ip_summed == CHECKSUM_HW &&
+	    !(u16)csum_fold(csum_add(psum, skb->csum)))
+		return 0;
 
-	skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+	skb->csum = psum;
 
-	return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+	return __skb_checksum_complete(skb);
 }
 
 /*
@@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb)
 
 	if (ulen != len)
 		goto out;
-	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0)
+	if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
 		goto out;
 	if (np->local_ip && np->local_ip != ntohl(iph->daddr))
 		goto out;
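The rewritten checksum_udp() first tries to validate the packet cheaply: it adds a freshly computed pseudo-header sum to the hardware-provided sum and accepts when the one's-complement total folds to zero, falling back to the full software walk in __skb_checksum_complete() only on mismatch or CHECKSUM_NONE. The reason a correct packet folds to zero is pure Internet-checksum arithmetic: the sender's check field was chosen so that pseudo-header plus datagram sums to all-ones. A userspace model of the helpers, assuming only the documented one's-complement semantics of csum_add()/csum_fold() (the kernel versions are arch-specific):

#include <stdint.h>

static uint32_t csum_add32(uint32_t a, uint32_t b)
{
	uint64_t s = (uint64_t)a + b;

	return (uint32_t)((s & 0xffffffffu) + (s >> 32));	/* end-around carry */
}

static uint16_t csum_fold16(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold the high half, twice */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;			/* one's-complement result */
}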
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 3f25cadccddd..f89e55f814d9 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1664,17 +1664,15 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out;
 	}
 
-	rv = dn_check_state(sk, NULL, 0, &timeo, flags);
-	if (rv)
-		goto out;
-
 	if (sk->sk_shutdown & RCV_SHUTDOWN) {
-		if (!(flags & MSG_NOSIGNAL))
-			send_sig(SIGPIPE, current, 0);
-		rv = -EPIPE;
+		rv = 0;
 		goto out;
 	}
 
+	rv = dn_check_state(sk, NULL, 0, &timeo, flags);
+	if (rv)
+		goto out;
+
 	if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
 		rv = -EOPNOTSUPP;
 		goto out;
@@ -1928,6 +1926,8 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (sk->sk_shutdown & SEND_SHUTDOWN) {
 		err = -EPIPE;
+		if (!(flags & MSG_NOSIGNAL))
+			send_sig(SIGPIPE, current, 0);
 		goto out_err;
 	}
 
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 175e093ec564..e3eceecd0496 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb)
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n");
+		/* fall through */
 	case CHECKSUM_NONE:
-		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb))
 			goto error;
-	default:;
 	}
 
 	if (!pskb_pull(skb, sizeof(struct icmphdr)))
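This hunk establishes the receive-side pattern that the igmp, ip_gre and ip_conntrack changes below repeat: trust a hardware sum that folds to zero, otherwise clear skb->csum (ICMP and IGMP checksums carry no pseudo-header) and reverify the whole packet in software. A consolidated sketch of that pattern, with an illustrative function name:

static int example_csum_ok(struct sk_buff *skb)
{
	switch (skb->ip_summed) {
	case CHECKSUM_HW:
		if (!(u16)csum_fold(skb->csum))
			return 1;	/* hardware sum checks out */
		/* fall through: hardware disagreed, reverify in software */
	case CHECKSUM_NONE:
		skb->csum = 0;		/* no pseudo-header contribution */
		if (__skb_checksum_complete(skb))
			return 0;	/* genuinely corrupt, caller drops */
	}
	return 1;
}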
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c6247fc84060..c04607b49212 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb)
 		return 0;
 	}
 
-	if (!pskb_may_pull(skb, sizeof(struct igmphdr)) ||
-	    (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
-		in_dev_put(in_dev);
-		kfree_skb(skb);
-		return 0;
+	if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
+		goto drop;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_HW:
+		if (!(u16)csum_fold(skb->csum))
+			break;
+		/* fall through */
+	case CHECKSUM_NONE:
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb))
+			goto drop;
 	}
 
 	ih = skb->h.igmph;
@@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb)
 	default:
 		NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
 	}
+
+drop:
 	in_dev_put(in_dev);
 	kfree_skb(skb);
 	return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 896ce3f8f53a..4e9c74b54b15 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb)
 		goto drop_nolock;
 
 	if (flags&GRE_CSUM) {
-		if (skb->ip_summed == CHECKSUM_HW) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_HW:
 			csum = (u16)csum_fold(skb->csum);
-			if (csum)
-				skb->ip_summed = CHECKSUM_NONE;
-		}
-		if (skb->ip_summed == CHECKSUM_NONE) {
-			skb->csum = skb_checksum(skb, 0, skb->len, 0);
+			if (!csum)
+				break;
+			/* fall through */
+		case CHECKSUM_NONE:
+			skb->csum = 0;
+			csum = __skb_checksum_complete(skb);
 			skb->ip_summed = CHECKSUM_HW;
-			csum = (u16)csum_fold(skb->csum);
 		}
 		offset += 4;
 	}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 5198f3a1e2cd..e4d6b268e8c4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -13,6 +13,7 @@
 #include <linux/in.h>
 #include <linux/icmp.h>
 #include <linux/seq_file.h>
+#include <linux/skbuff.h>
 #include <net/ip.h>
 #include <net/checksum.h>
 #include <linux/netfilter.h>
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
 	case CHECKSUM_HW:
 		if (!(u16)csum_fold(skb->csum))
 			break;
-		if (LOG_INVALID(IPPROTO_ICMP))
-			nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
-				      "ip_ct_icmp: bad HW ICMP checksum ");
-		return -NF_ACCEPT;
+		/* fall through */
 	case CHECKSUM_NONE:
-		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+		skb->csum = 0;
+		if (__skb_checksum_complete(skb)) {
 			if (LOG_INVALID(IPPROTO_ICMP))
 				nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
 					      "ip_ct_icmp: bad ICMP checksum ");
 			return -NF_ACCEPT;
 		}
-	default:
-		break;
 	}
 
 checksum_skipped:
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 652685623519..01444a02b48b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_tcp_congestion_control,
 		.strategy	= &sysctl_tcp_congestion_control,
 	},
+	{
+		.ctl_name	= NET_TCP_ABC,
+		.procname	= "tcp_abc",
+		.data		= &sysctl_tcp_abc,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
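Registering NET_TCP_ABC in ipv4_table exposes the new knob as /proc/sys/net/ipv4/tcp_abc, a plain integer handled by proc_dointvec with mode 0644. A hypothetical userspace check (the path follows from the table registration above):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_abc", "r");
	int abc = -1;

	if (f) {
		if (fscanf(f, "%d", &abc) != 1)
			abc = -1;
		fclose(f);
	}
	printf("tcp_abc = %d\n", abc);	/* 1 = ABC enabled (the new default) */
	return 0;
}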
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 72b7c22e1ea5..9ac7a4f46bd8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	} else if (tcp_need_reset(old_state) ||
 		   (tp->snd_nxt != tp->write_seq &&
 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
-		/* The last check adjusts for discrepance of Linux wrt. RFC
+		/* The last check adjusts for discrepancy of Linux wrt. RFC
 		 * states
 		 */
 		tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->packets_out = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
+	tp->bytes_acked = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e0609047..1d0cd86621b1 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 
 	bictcp_low_utilization(sk, data_acked);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
 		bictcp_update(ca, tp->snd_cwnd);
 
 		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
 		 */
 		if (tp->snd_cwnd_cnt >= ca->cnt) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e89..c7cc62c8dc12 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-		/* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-			tp->snd_cwnd_cnt = 0;
-		} else
-			tp->snd_cwnd_cnt++;
-	}
+	/* In "safe" area, increase. */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+
+	/* In dangerous area, increase slowly. */
+	else if (sysctl_tcp_abc) {
+		/* RFC3465: Appropriate Byte Count
+		 * increase once for each full cwnd acked
+		 */
+		if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
+			tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+		}
+	} else {
+		/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
+		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
 }
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
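The new sysctl_tcp_abc branch implements RFC 3465 Appropriate Byte Counting: instead of opening the window a fraction per ACK, it opens it by one segment once a full congestion window's worth of bytes has been cumulatively acknowledged. With snd_cwnd = 10 and mss_cache = 1460, the window grows after 14600 acked bytes; a peer that delays ACKs to every other segment reaches that after 5 ACKs, whereas the ACK-counting branch would need 10, so ABC restores roughly one segment of growth per RTT under delayed ACKs. An illustrative userspace model of the accounting (not kernel code):

static unsigned int abc_update(unsigned int snd_cwnd, unsigned int mss,
			       unsigned int *bytes_acked,
			       unsigned int acked_now)
{
	*bytes_acked += acked_now;
	if (*bytes_acked >= snd_cwnd * mss) {
		*bytes_acked -= snd_cwnd * mss;
		snd_cwnd++;	/* the kernel also clamps at snd_cwnd_clamp */
	}
	return snd_cwnd;
}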
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde080..82b3c189bd7d 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
 }
 
 static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
-			     u32 in_flight, int good)
+			     u32 in_flight, u32 pkts_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hstcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
 		/* Update AIMD parameters */
 		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
 			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e95..3284cfb993e6 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct htcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
+
 		measure_rtt(sk);
 
 		/* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			htcp_alpha_update(ca);
 		}
 
-		/* In dangerous area, increase slowly. 
+		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
 		 */
 		if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623df..40dbb3877510 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 		ca->minrtt = tp->srtt;
 	}
 
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
 	if (!ca->hybla_en)
 		return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (ca->rho == 0)
 		hybla_recalc_param(sk);
 
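All of the congestion-avoidance modules above now defer to tcp_is_cwnd_limited() instead of the open-coded in_flight < tp->snd_cwnd test, so an application-limited sender stops inflating cwnd, while slow start is likewise centralized in tcp_slow_start(). The rough shape of the helper, sketched here with the TSO burst allowance stated as an assumption rather than a quote of the kernel implementation:

static int example_is_cwnd_limited(unsigned int snd_cwnd,
				   unsigned int in_flight,
				   int tso, unsigned int max_burst)
{
	if (in_flight >= snd_cwnd)
		return 1;	/* the window is full: cwnd-limited */
	if (!tso)
		return 0;	/* window not full: application-limited */
	/* a TSO sender within one burst of filling the window still counts */
	return snd_cwnd - in_flight <= max_burst;
}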
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578dc..40a26b7157b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
  *		Andi Kleen	:	Moved open_request checking here
  *					and process RSTs for open_requests.
  *		Andi Kleen	:	Better prune_queue, and other fixes.
- *		Andrey Savochkin:	Fix RTT measurements in the presnce of
+ *		Andrey Savochkin:	Fix RTT measurements in the presence of
  *					timestamps.
  *		Andrey Savochkin:	Check sequence numbers correctly when
  *					removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_abc = 1;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data. */
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
  * of receiver window. Check #2.
  *
  * The scheme does not work when sender sends good segments opening
- * window and then starts to feed us spagetti. But it should work
+ * window and then starts to feed us spaghetti. But it should work
  * in common situations. Otherwise, we have to rely on queue collapsing.
  */
 
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
 {
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(skb->truesize)/2;
-	int window = tcp_full_space(sk)/2;
+	int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
 
 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
 
 	/* Try to select rcvbuf so that 4 mss-sized segments
-	 * will fit to window and correspoding skbs will fit to our rcvbuf.
+	 * will fit to window and corresponding skbs will fit to our rcvbuf.
 	 * (was 3; 4 is minimum to allow fast retransmit to work.)
 	 */
 	while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
 }
 
-/* 4. Try to fixup all. It is made iimediately after connection enters
+/* 4. Try to fixup all. It is made immediately after connection enters
  *    established state.
  */
 static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct sk_buff *skb;
-	unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
-	int ofo_win = 0;
 
 	icsk->icsk_ack.quick = 0;
 
-	skb_queue_walk(&tp->out_of_order_queue, skb) {
-		ofo_win += skb->len;
-	}
-
-	/* If overcommit is due to out of order segments,
-	 * do not clamp window. Try to expand rcvbuf instead.
-	 */
-	if (ofo_win) {
-		if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-		    !tcp_memory_pressure &&
-		    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
-			sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-					    sysctl_tcp_rmem[2]);
+	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+	    !tcp_memory_pressure &&
+	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+				    sysctl_tcp_rmem[2]);
 	}
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
-		app_win += ofo_win;
-		if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
-			app_win >>= 1;
-		if (app_win > icsk->icsk_ack.rcv_mss)
-			app_win -= icsk->icsk_ack.rcv_mss;
-		app_win = max(app_win, 2U*tp->advmss);
-
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
-	}
 }
 
 /* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 	 * are stalled on filesystem I/O.
 	 *
 	 * Also, since we are only going for a minimum in the
-	 * non-timestamp case, we do not smoothe things out
-	 * else with timestamps disabled convergance takes too
+	 * non-timestamp case, we do not smooth things out
+	 * else with timestamps disabled convergence takes too
 	 * long.
 	 */
 	if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 		} else if (m < new_sample)
 			new_sample = m << 3;
 	} else {
-		/* No previous mesaure. */
+		/* No previous measure. */
 		new_sample = m << 3;
 	}
 
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 		if (icsk->icsk_ack.ato > icsk->icsk_rto)
 			icsk->icsk_ack.ato = icsk->icsk_rto;
 	} else if (m > icsk->icsk_rto) {
-		/* Too long gap. Apparently sender falled to
+		/* Too long gap. Apparently sender failed to
 		 * restart window, so that we send ACKs quickly.
 		 */
 		tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	long m = mrtt; /* RTT */
 
 	/* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
 	 *
 	 * Funny. This algorithm seems to be very broken.
 	 * These formulae increase RTO, when it should be decreased, increase
-	 * too slowly, when it should be incresed fastly, decrease too fastly
+	 * too slowly, when it should be increased fastly, decrease too fastly
 	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
 	 * does not matter how to _calculate_ it. Seems, it was trap
 	 * that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
 		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
 		tp->rtt_seq = tp->snd_nxt;
 	}
-
-	if (icsk->icsk_ca_ops->rtt_sample)
-		icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
 }
 
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
 	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
 	 *    to do with delayed acks, because at cwnd>2 true delack timeout
 	 *    is invisible. Actually, Linux-2.4 also generates erratic
-	 *    ACKs in some curcumstances.
+	 *    ACKs in some circumstances.
 	 */
 	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
 
 	/* 2. Fixups made earlier cannot be right.
 	 *    If we do not estimate RTO correctly without them,
 	 *    all the algo is pure shit and should be replaced
-	 *    with correct one. It is exaclty, which we pretend to do.
+	 *    with correct one. It is exactly, which we pretend to do.
 	 */
 }
 
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
  * to make it more realistic.
  *
  * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal curcumstances sending small
+ * is sent until it is ACKed. In normal circumstances sending small
  * packets force peer to delay ACKs and calculation is correct too.
  * The algorithm is adaptive and, provided we follow specs, it
  * NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	int prior_fackets;
 	u32 lost_retrans = 0;
 	int flag = 0;
+	int dup_sack = 0;
 	int i;
 
 	if (!tp->sacked_out)
 		tp->fackets_out = 0;
 	prior_fackets = tp->fackets_out;
 
-	for (i=0; i<num_sacks; i++, sp++) {
-		struct sk_buff *skb;
-		__u32 start_seq = ntohl(sp->start_seq);
-		__u32 end_seq = ntohl(sp->end_seq);
-		int fack_count = 0;
-		int dup_sack = 0;
+	/* SACK fastpath:
+	 * if the only SACK change is the increase of the end_seq of
+	 * the first block then only apply that SACK block
+	 * and use retrans queue hinting otherwise slowpath */
+	flag = 1;
+	for (i = 0; i< num_sacks; i++) {
+		__u32 start_seq = ntohl(sp[i].start_seq);
+		__u32 end_seq = ntohl(sp[i].end_seq);
+
+		if (i == 0){
+			if (tp->recv_sack_cache[i].start_seq != start_seq)
+				flag = 0;
+		} else {
+			if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+			    (tp->recv_sack_cache[i].end_seq != end_seq))
+				flag = 0;
+		}
+		tp->recv_sack_cache[i].start_seq = start_seq;
+		tp->recv_sack_cache[i].end_seq = end_seq;
 
 		/* Check for D-SACK. */
 		if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			if (before(ack, prior_snd_una - tp->max_window))
 				return 0;
 		}
+	}
+
+	if (flag)
+		num_sacks = 1;
+	else {
+		int j;
+		tp->fastpath_skb_hint = NULL;
+
+		/* order SACK blocks to allow in order walk of the retrans queue */
+		for (i = num_sacks-1; i > 0; i--) {
+			for (j = 0; j < i; j++){
+				if (after(ntohl(sp[j].start_seq),
+					  ntohl(sp[j+1].start_seq))){
+					sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
+					sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
+					sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
+					sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+				}
+
+			}
+		}
+	}
+
+	/* clear flag as used for different purpose in following code */
+	flag = 0;
+
+	for (i=0; i<num_sacks; i++, sp++) {
+		struct sk_buff *skb;
+		__u32 start_seq = ntohl(sp->start_seq);
+		__u32 end_seq = ntohl(sp->end_seq);
+		int fack_count;
+
+		/* Use SACK fastpath hint if valid */
+		if (tp->fastpath_skb_hint) {
+			skb = tp->fastpath_skb_hint;
+			fack_count = tp->fastpath_cnt_hint;
+		} else {
+			skb = sk->sk_write_queue.next;
+			fack_count = 0;
+		}
 
 		/* Event "B" in the comment above. */
 		if (after(end_seq, tp->high_seq))
 			flag |= FLAG_DATA_LOST;
 
-		sk_stream_for_retrans_queue(skb, sk) {
+		sk_stream_for_retrans_queue_from(skb, sk) {
 			int in_sack, pcount;
 			u8 sacked;
 
+			tp->fastpath_skb_hint = skb;
+			tp->fastpath_cnt_hint = fack_count;
+
 			/* The retransmission queue is always in order, so
 			 * we can short-circuit the walk early.
 			 */
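The two hunks above introduce the SACK fastpath: the previously seen SACK blocks are cached in recv_sack_cache, and when the only change is growth of the first block's end_seq the code processes just that block and resumes the queue walk at fastpath_skb_hint instead of rescanning from the head; otherwise the blocks are bubble-sorted by start_seq so a single in-order walk suffices. A userspace model of the cache comparison, with illustrative types and names:

#include <stdint.h>

struct sack_block {
	uint32_t start_seq;
	uint32_t end_seq;
};

/* Fastpath is kept only when every start matches the cache and every
 * block after the first also matches its cached end; block 0 alone may
 * grow its end_seq. */
static int sack_fastpath_ok(const struct sack_block *cache,
			    const struct sack_block *sp, int num_sacks)
{
	int i;

	for (i = 0; i < num_sacks; i++) {
		if (cache[i].start_seq != sp[i].start_seq)
			return 0;
		if (i > 0 && cache[i].end_seq != sp[i].end_seq)
			return 0;
	}
	return 1;
}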
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 				TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
 				tp->lost_out -= tcp_skb_pcount(skb);
 				tp->retrans_out -= tcp_skb_pcount(skb);
+
+				/* clear lost hint */
+				tp->retransmit_skb_hint = NULL;
 			}
 		} else {
 			/* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			if (sacked & TCPCB_LOST) {
 				TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 				tp->lost_out -= tcp_skb_pcount(skb);
+
+				/* clear lost hint */
+				tp->retransmit_skb_hint = NULL;
 			}
 		}
 
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		    (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
+			tp->retransmit_skb_hint = NULL;
 		}
 	}
 }
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
 
+			/* clear lost hint */
+			tp->retransmit_skb_hint = NULL;
+
 			if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
 				tp->lost_out += tcp_skb_pcount(skb);
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
+
+	clear_all_retrans_hints(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 
+	tp->bytes_acked = 0;
 	tcp_clear_retrans(tp);
 
 	/* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
+
+	clear_all_retrans_hints(tp);
 }
 
 static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
 			       int packets, u32 high_seq)
 {
 	struct sk_buff *skb;
-	int cnt = packets;
+	int cnt;
 
-	BUG_TRAP(cnt <= tp->packets_out);
+	BUG_TRAP(packets <= tp->packets_out);
+	if (tp->lost_skb_hint) {
+		skb = tp->lost_skb_hint;
+		cnt = tp->lost_cnt_hint;
+	} else {
+		skb = sk->sk_write_queue.next;
+		cnt = 0;
+	}
 
-	sk_stream_for_retrans_queue(skb, sk) {
-		cnt -= tcp_skb_pcount(skb);
-		if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+	sk_stream_for_retrans_queue_from(skb, sk) {
+		/* TODO: do this better */
+		/* this is not the most efficient way to do this... */
+		tp->lost_skb_hint = skb;
+		tp->lost_cnt_hint = cnt;
+		cnt += tcp_skb_pcount(skb);
+		if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
 			break;
 		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
+
+			/* clear xmit_retransmit_queue hints
+			 *  if this is beyond hint */
+			if(tp->retransmit_skb_hint != NULL &&
+			   before(TCP_SKB_CB(skb)->seq,
+				  TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+				tp->retransmit_skb_hint = NULL;
+			}
 		}
 	}
 	tcp_sync_left_out(tp);
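tcp_mark_head_lost() now resumes its walk from lost_skb_hint/lost_cnt_hint, turning repeated scoreboard marking into incremental work rather than a rescan of the whole retransmit queue; note the loop accordingly counts packets upward from the head (cnt > packets) instead of decrementing a budget. The same memoized-walk idea reduced to a sketch, with illustrative names (the kernel keeps these hints in struct tcp_sock):

struct list_node {
	struct list_node *next;
};

struct walk_hint {
	struct list_node *pos;	/* where the previous walk stopped */
	int cnt;		/* packets counted up to that point */
};

static struct list_node *walk_start(struct list_node *head,
				    struct walk_hint *hint, int *cnt)
{
	if (hint->pos) {	/* warm start: resume where we left off */
		*cnt = hint->cnt;
		return hint->pos;
	}
	*cnt = 0;		/* cold start: scan from the queue head */
	return head->next;
}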
| @@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) | |||
| 1540 | if (tcp_head_timedout(sk, tp)) { | 1610 | if (tcp_head_timedout(sk, tp)) { |
| 1541 | struct sk_buff *skb; | 1611 | struct sk_buff *skb; |
| 1542 | 1612 | ||
| 1543 | sk_stream_for_retrans_queue(skb, sk) { | 1613 | skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint |
| 1544 | if (tcp_skb_timedout(sk, skb) && | 1614 | : sk->sk_write_queue.next; |
| 1545 | !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | 1615 | |
| 1616 | sk_stream_for_retrans_queue_from(skb, sk) { | ||
| 1617 | if (!tcp_skb_timedout(sk, skb)) | ||
| 1618 | break; | ||
| 1619 | |||
| 1620 | if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { | ||
| 1546 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | 1621 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; |
| 1547 | tp->lost_out += tcp_skb_pcount(skb); | 1622 | tp->lost_out += tcp_skb_pcount(skb); |
| 1623 | |||
| 1624 | /* clear xmit_retrans hint */ | ||
| 1625 | if (tp->retransmit_skb_hint && | ||
| 1626 | before(TCP_SKB_CB(skb)->seq, | ||
| 1627 | TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) | ||
| 1628 | |||
| 1629 | tp->retransmit_skb_hint = NULL; | ||
| 1548 | } | 1630 | } |
| 1549 | } | 1631 | } |
| 1632 | |||
| 1633 | tp->scoreboard_skb_hint = skb; | ||
| 1634 | |||
| 1550 | tcp_sync_left_out(tp); | 1635 | tcp_sync_left_out(tp); |
| 1551 | } | 1636 | } |
| 1552 | } | 1637 | } |
| @@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) | |||
| 1626 | } | 1711 | } |
| 1627 | tcp_moderate_cwnd(tp); | 1712 | tcp_moderate_cwnd(tp); |
| 1628 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1713 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 1714 | |||
| 1715 | /* There is something screwy going on with the retrans hints after | ||
| 1716 | an undo */ | ||
| 1717 | clear_all_retrans_hints(tp); | ||
| 1629 | } | 1718 | } |
| 1630 | 1719 | ||
| 1631 | static inline int tcp_may_undo(struct tcp_sock *tp) | 1720 | static inline int tcp_may_undo(struct tcp_sock *tp) |
| @@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
| 1709 | sk_stream_for_retrans_queue(skb, sk) { | 1798 | sk_stream_for_retrans_queue(skb, sk) { |
| 1710 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; | 1799 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; |
| 1711 | } | 1800 | } |
| 1801 | |||
| 1802 | clear_all_retrans_hints(tp); | ||
| 1803 | |||
| 1712 | DBGUNDO(sk, tp, "partial loss"); | 1804 | DBGUNDO(sk, tp, "partial loss"); |
| 1713 | tp->lost_out = 0; | 1805 | tp->lost_out = 0; |
| 1714 | tp->left_out = tp->sacked_out; | 1806 | tp->left_out = tp->sacked_out; |
| @@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1908 | TCP_ECN_queue_cwr(tp); | 2000 | TCP_ECN_queue_cwr(tp); |
| 1909 | } | 2001 | } |
| 1910 | 2002 | ||
| 2003 | tp->bytes_acked = 0; | ||
| 1911 | tp->snd_cwnd_cnt = 0; | 2004 | tp->snd_cwnd_cnt = 0; |
| 1912 | tcp_set_ca_state(sk, TCP_CA_Recovery); | 2005 | tcp_set_ca_state(sk, TCP_CA_Recovery); |
| 1913 | } | 2006 | } |
| @@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1919 | } | 2012 | } |
| 1920 | 2013 | ||
| 1921 | /* Read draft-ietf-tcplw-high-performance before mucking | 2014 | /* Read draft-ietf-tcplw-high-performance before mucking |
| 1922 | * with this code. (Superceeds RFC1323) | 2015 | * with this code. (Supersedes RFC1323) |
| 1923 | */ | 2016 | */ |
| 1924 | static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) | 2017 | static void tcp_ack_saw_tstamp(struct sock *sk, int flag) |
| 1925 | { | 2018 | { |
| 1926 | /* RTTM Rule: A TSecr value received in a segment is used to | 2019 | /* RTTM Rule: A TSecr value received in a segment is used to |
| 1927 | * update the averaged RTT measurement only if the segment | 2020 | * update the averaged RTT measurement only if the segment |
| @@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) | |||
| 1932 | * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> | 2025 | * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> |
| 1933 | * | 2026 | * |
| 1934 | * Changed: reset backoff as soon as we see the first valid sample. | 2027 | * Changed: reset backoff as soon as we see the first valid sample. |
| 1935 | * If we do not, we get strongly overstimated rto. With timestamps | 2028 | * If we do not, we get strongly overestimated rto. With timestamps |
| 1936 | * samples are accepted even from very old segments: f.e., when rtt=1 | 2029 | * samples are accepted even from very old segments: f.e., when rtt=1 |
| 1937 | * increases to 8, we retransmit 5 times and after 8 seconds delayed | 2030 | * increases to 8, we retransmit 5 times and after 8 seconds delayed |
| 1938 | * answer arrives rto becomes 120 seconds! If at least one of segments | 2031 | * answer arrives rto becomes 120 seconds! If at least one of segments |
| @@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) | |||
| 1940 | */ | 2033 | */ |
| 1941 | struct tcp_sock *tp = tcp_sk(sk); | 2034 | struct tcp_sock *tp = tcp_sk(sk); |
| 1942 | const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 2035 | const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
| 1943 | tcp_rtt_estimator(sk, seq_rtt, usrtt); | 2036 | tcp_rtt_estimator(sk, seq_rtt); |
| 1944 | tcp_set_rto(sk); | 2037 | tcp_set_rto(sk); |
| 1945 | inet_csk(sk)->icsk_backoff = 0; | 2038 | inet_csk(sk)->icsk_backoff = 0; |
| 1946 | tcp_bound_rto(sk); | 2039 | tcp_bound_rto(sk); |
| 1947 | } | 2040 | } |
| 1948 | 2041 | ||
| 1949 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) | 2042 | static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) |
| 1950 | { | 2043 | { |
| 1951 | /* We don't have a timestamp. Can only use | 2044 | /* We don't have a timestamp. Can only use |
| 1952 | * packets that are not retransmitted to determine | 2045 | * packets that are not retransmitted to determine |
| @@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag | |||
| 1960 | if (flag & FLAG_RETRANS_DATA_ACKED) | 2053 | if (flag & FLAG_RETRANS_DATA_ACKED) |
| 1961 | return; | 2054 | return; |
| 1962 | 2055 | ||
| 1963 | tcp_rtt_estimator(sk, seq_rtt, usrtt); | 2056 | tcp_rtt_estimator(sk, seq_rtt); |
| 1964 | tcp_set_rto(sk); | 2057 | tcp_set_rto(sk); |
| 1965 | inet_csk(sk)->icsk_backoff = 0; | 2058 | inet_csk(sk)->icsk_backoff = 0; |
| 1966 | tcp_bound_rto(sk); | 2059 | tcp_bound_rto(sk); |
| 1967 | } | 2060 | } |
| 1968 | 2061 | ||
| 1969 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, | 2062 | static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, |
| 1970 | const s32 seq_rtt, u32 *usrtt) | 2063 | const s32 seq_rtt) |
| 1971 | { | 2064 | { |
| 1972 | const struct tcp_sock *tp = tcp_sk(sk); | 2065 | const struct tcp_sock *tp = tcp_sk(sk); |
| 1973 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 2066 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
| 1974 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 2067 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 1975 | tcp_ack_saw_tstamp(sk, usrtt, flag); | 2068 | tcp_ack_saw_tstamp(sk, flag); |
| 1976 | else if (seq_rtt >= 0) | 2069 | else if (seq_rtt >= 0) |
| 1977 | tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); | 2070 | tcp_ack_no_tstamp(sk, seq_rtt, flag); |
| 1978 | } | 2071 | } |
| 1979 | 2072 | ||
| 1980 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | 2073 | static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, |
| @@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
| 2054 | return acked; | 2147 | return acked; |
| 2055 | } | 2148 | } |
| 2056 | 2149 | ||
| 2150 | static inline u32 tcp_usrtt(const struct sk_buff *skb) | ||
| 2151 | { | ||
| 2152 | struct timeval tv, now; | ||
| 2153 | |||
| 2154 | do_gettimeofday(&now); | ||
| 2155 | skb_get_timestamp(skb, &tv); | ||
| 2156 | return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec); | ||
| 2157 | } | ||
| 2057 | 2158 | ||
| 2058 | /* Remove acknowledged frames from the retransmission queue. */ | 2159 | /* Remove acknowledged frames from the retransmission queue. */ |
| 2059 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) | 2160 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) |
| 2060 | { | 2161 | { |
| 2061 | struct tcp_sock *tp = tcp_sk(sk); | 2162 | struct tcp_sock *tp = tcp_sk(sk); |
| 2163 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2062 | struct sk_buff *skb; | 2164 | struct sk_buff *skb; |
| 2063 | __u32 now = tcp_time_stamp; | 2165 | __u32 now = tcp_time_stamp; |
| 2064 | int acked = 0; | 2166 | int acked = 0; |
| 2065 | __s32 seq_rtt = -1; | 2167 | __s32 seq_rtt = -1; |
| 2066 | struct timeval usnow; | ||
| 2067 | u32 pkts_acked = 0; | 2168 | u32 pkts_acked = 0; |
| 2068 | 2169 | void (*rtt_sample)(struct sock *sk, u32 usrtt) | |
| 2069 | if (seq_usrtt) | 2170 | = icsk->icsk_ca_ops->rtt_sample; |
| 2070 | do_gettimeofday(&usnow); | ||
| 2071 | 2171 | ||
| 2072 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2172 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
| 2073 | skb != sk->sk_send_head) { | 2173 | skb != sk->sk_send_head) { |
| @@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
| 2107 | tp->retrans_out -= tcp_skb_pcount(skb); | 2207 | tp->retrans_out -= tcp_skb_pcount(skb); |
| 2108 | acked |= FLAG_RETRANS_DATA_ACKED; | 2208 | acked |= FLAG_RETRANS_DATA_ACKED; |
| 2109 | seq_rtt = -1; | 2209 | seq_rtt = -1; |
| 2110 | } else if (seq_rtt < 0) | 2210 | } else if (seq_rtt < 0) { |
| 2111 | seq_rtt = now - scb->when; | 2211 | seq_rtt = now - scb->when; |
| 2112 | if (seq_usrtt) { | 2212 | if (rtt_sample) |
| 2113 | struct timeval tv; | 2213 | (*rtt_sample)(sk, tcp_usrtt(skb)); |
| 2114 | |||
| 2115 | skb_get_timestamp(skb, &tv); | ||
| 2116 | *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 | ||
| 2117 | + (usnow.tv_usec - tv.tv_usec); | ||
| 2118 | } | 2214 | } |
| 2119 | |||
| 2120 | if (sacked & TCPCB_SACKED_ACKED) | 2215 | if (sacked & TCPCB_SACKED_ACKED) |
| 2121 | tp->sacked_out -= tcp_skb_pcount(skb); | 2216 | tp->sacked_out -= tcp_skb_pcount(skb); |
| 2122 | if (sacked & TCPCB_LOST) | 2217 | if (sacked & TCPCB_LOST) |
| @@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
| 2126 | !before(scb->end_seq, tp->snd_up)) | 2221 | !before(scb->end_seq, tp->snd_up)) |
| 2127 | tp->urg_mode = 0; | 2222 | tp->urg_mode = 0; |
| 2128 | } | 2223 | } |
| 2129 | } else if (seq_rtt < 0) | 2224 | } else if (seq_rtt < 0) { |
| 2130 | seq_rtt = now - scb->when; | 2225 | seq_rtt = now - scb->when; |
| 2226 | if (rtt_sample) | ||
| 2227 | (*rtt_sample)(sk, tcp_usrtt(skb)); | ||
| 2228 | } | ||
| 2131 | tcp_dec_pcount_approx(&tp->fackets_out, skb); | 2229 | tcp_dec_pcount_approx(&tp->fackets_out, skb); |
| 2132 | tcp_packets_out_dec(tp, skb); | 2230 | tcp_packets_out_dec(tp, skb); |
| 2133 | __skb_unlink(skb, &sk->sk_write_queue); | 2231 | __skb_unlink(skb, &sk->sk_write_queue); |
| 2134 | sk_stream_free_skb(sk, skb); | 2232 | sk_stream_free_skb(sk, skb); |
| 2233 | clear_all_retrans_hints(tp); | ||
| 2135 | } | 2234 | } |
| 2136 | 2235 | ||
| 2137 | if (acked&FLAG_ACKED) { | 2236 | if (acked&FLAG_ACKED) { |
| 2138 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2237 | tcp_ack_update_rtt(sk, acked, seq_rtt); |
| 2139 | tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); | ||
| 2140 | tcp_ack_packets_out(sk, tp); | 2238 | tcp_ack_packets_out(sk, tp); |
| 2141 | 2239 | ||
| 2142 | if (icsk->icsk_ca_ops->pkts_acked) | 2240 | if (icsk->icsk_ca_ops->pkts_acked) |
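The hunk above replaces the old per-caller seq_usrtt plumbing with the new tcp_usrtt() helper and an optional per-congestion-module rtt_sample hook, fetched once from icsk_ca_ops. A minimal userspace sketch of the same arithmetic and dispatch (elapsed_usecs, rtt_sample_fn and sample_if_set are illustrative names, not kernel symbols):

    #include <stddef.h>
    #include <sys/time.h>

    /* Microseconds between a stored send timestamp and "now",
     * mirroring the arithmetic of the new tcp_usrtt() helper. */
    static unsigned int elapsed_usecs(const struct timeval *sent)
    {
            struct timeval now;

            gettimeofday(&now, NULL);
            return (now.tv_sec - sent->tv_sec) * 1000000 +
                   (now.tv_usec - sent->tv_usec);
    }

    /* The hook is looked up once per call; a module that leaves
     * rtt_sample NULL pays only this pointer test. */
    typedef void (*rtt_sample_fn)(void *ca, unsigned int usrtt);

    static void sample_if_set(rtt_sample_fn hook, void *ca,
                              const struct timeval *sent)
    {
            if (hook)
                    hook(ca, elapsed_usecs(sent));
    }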
| @@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
| 2284 | } | 2382 | } |
| 2285 | 2383 | ||
| 2286 | /* F-RTO affects on two new ACKs following RTO. | 2384 | /* F-RTO affects on two new ACKs following RTO. |
| 2287 | * At latest on third ACK the TCP behavor is back to normal. | 2385 | * At latest on third ACK the TCP behavior is back to normal. |
| 2288 | */ | 2386 | */ |
| 2289 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2387 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
| 2290 | } | 2388 | } |
| @@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2299 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2397 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 2300 | u32 prior_in_flight; | 2398 | u32 prior_in_flight; |
| 2301 | s32 seq_rtt; | 2399 | s32 seq_rtt; |
| 2302 | s32 seq_usrtt = 0; | ||
| 2303 | int prior_packets; | 2400 | int prior_packets; |
| 2304 | 2401 | ||
| 2305 | /* If the ack is newer than sent or older than previous acks | 2402 | /* If the ack is newer than sent or older than previous acks |
| @@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2311 | if (before(ack, prior_snd_una)) | 2408 | if (before(ack, prior_snd_una)) |
| 2312 | goto old_ack; | 2409 | goto old_ack; |
| 2313 | 2410 | ||
| 2411 | if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) | ||
| 2412 | tp->bytes_acked += ack - prior_snd_una; | ||
| 2413 | |||
| 2314 | if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { | 2414 | if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
| 2315 | /* Window is constant, pure forward advance. | 2415 | /* Window is constant, pure forward advance. |
| 2316 | * No more checks are required. | 2416 | * No more checks are required. |
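The two added lines above are the accounting half of Appropriate Byte Counting (RFC 3465): while the sender is below the TCP_CA_CWR state, every new cumulative ACK credits its exact byte count to tp->bytes_acked, which the slow-start code then spends in MSS-sized units. A sketch of that spending step, ignoring the per-ACK cap RFC 3465 recommends (struct cc and abc_slow_start are illustrative, not the kernel's tcp_slow_start):

    /* Spend accumulated bytes_acked in MSS-sized units: cwnd grows one
     * segment per full MSS cumulatively acknowledged, so stretch ACKs
     * and sub-MSS ACKs are credited exactly. */
    struct cc {
            unsigned int snd_cwnd;          /* window, in segments */
            unsigned int bytes_acked;       /* new bytes acked so far */
            unsigned int mss;               /* sender MSS, > 0 */
    };

    static void abc_slow_start(struct cc *c, unsigned int newly_acked)
    {
            c->bytes_acked += newly_acked;
            while (c->bytes_acked >= c->mss) {
                    c->bytes_acked -= c->mss;
                    c->snd_cwnd++;
            }
    }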
| @@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2352 | prior_in_flight = tcp_packets_in_flight(tp); | 2452 | prior_in_flight = tcp_packets_in_flight(tp); |
| 2353 | 2453 | ||
| 2354 | /* See if we can take anything off of the retransmit queue. */ | 2454 | /* See if we can take anything off of the retransmit queue. */ |
| 2355 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, | 2455 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); |
| 2356 | icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
| 2357 | 2456 | ||
| 2358 | if (tp->frto_counter) | 2457 | if (tp->frto_counter) |
| 2359 | tcp_process_frto(sk, prior_snd_una); | 2458 | tcp_process_frto(sk, prior_snd_una); |
| 2360 | 2459 | ||
| 2361 | if (tcp_ack_is_dubious(sk, flag)) { | 2460 | if (tcp_ack_is_dubious(sk, flag)) { |
| 2362 | /* Advanve CWND, if state allows this. */ | 2461 | /* Advance CWND, if state allows this. */ |
| 2363 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) | 2462 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) |
| 2364 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); | 2463 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); |
| 2365 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2464 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
| @@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, | |||
| 3148 | { | 3247 | { |
| 3149 | struct sk_buff *skb; | 3248 | struct sk_buff *skb; |
| 3150 | 3249 | ||
| 3151 | /* First, check that queue is collapsable and find | 3250 | /* First, check that queue is collapsible and find |
| 3152 | * the point where collapsing can be useful. */ | 3251 | * the point where collapsing can be useful. */ |
| 3153 | for (skb = head; skb != tail; ) { | 3252 | for (skb = head; skb != tail; ) { |
| 3154 | /* No new bits? It is possible on ofo queue. */ | 3253 | /* No new bits? It is possible on ofo queue. */ |
| @@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) | |||
| 3456 | 3555 | ||
| 3457 | /* | 3556 | /* |
| 3458 | * This routine is only called when we have urgent data | 3557 | * This routine is only called when we have urgent data |
| 3459 | * signalled. Its the 'slow' part of tcp_urg. It could be | 3558 | * signaled. Its the 'slow' part of tcp_urg. It could be |
| 3460 | * moved inline now as tcp_urg is only called from one | 3559 | * moved inline now as tcp_urg is only called from one |
| 3461 | * place. We handle URGent data wrong. We have to - as | 3560 | * place. We handle URGent data wrong. We have to - as |
| 3462 | * BSD still doesn't use the correction from RFC961. | 3561 | * BSD still doesn't use the correction from RFC961. |
| @@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) | |||
| 3501 | * urgent. To do this requires some care. We cannot just ignore | 3600 | * urgent. To do this requires some care. We cannot just ignore |
| 3502 | * tp->copied_seq since we would read the last urgent byte again | 3601 | * tp->copied_seq since we would read the last urgent byte again |
| 3503 | * as data, nor can we alter copied_seq until this data arrives | 3602 | * as data, nor can we alter copied_seq until this data arrives |
| 3504 | * or we break the sematics of SIOCATMARK (and thus sockatmark()) | 3603 | * or we break the semantics of SIOCATMARK (and thus sockatmark()) |
| 3505 | * | 3604 | * |
| 3506 | * NOTE. Double Dutch. Rendering to plain English: author of comment | 3605 | * NOTE. Double Dutch. Rendering to plain English: author of comment |
| 3507 | * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); | 3606 | * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); |
| @@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
| 3646 | tp->rx_opt.saw_tstamp = 0; | 3745 | tp->rx_opt.saw_tstamp = 0; |
| 3647 | 3746 | ||
| 3648 | /* pred_flags is 0xS?10 << 16 + snd_wnd | 3747 | /* pred_flags is 0xS?10 << 16 + snd_wnd |
| 3649 | * if header_predition is to be made | 3748 | * if header_prediction is to be made |
| 3650 | * 'S' will always be tp->tcp_header_len >> 2 | 3749 | * 'S' will always be tp->tcp_header_len >> 2 |
| 3651 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to | 3750 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to |
| 3652 | * turn it off (when there are holes in the receive | 3751 | * turn it off (when there are holes in the receive |
| @@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4242 | */ | 4341 | */ |
| 4243 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4342 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 4244 | !tp->srtt) | 4343 | !tp->srtt) |
| 4245 | tcp_ack_saw_tstamp(sk, NULL, 0); | 4344 | tcp_ack_saw_tstamp(sk, 0); |
| 4246 | 4345 | ||
| 4247 | if (tp->rx_opt.tstamp_ok) | 4346 | if (tp->rx_opt.tstamp_ok) |
| 4248 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4347 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| @@ -4372,6 +4471,7 @@ discard: | |||
| 4372 | 4471 | ||
| 4373 | EXPORT_SYMBOL(sysctl_tcp_ecn); | 4472 | EXPORT_SYMBOL(sysctl_tcp_ecn); |
| 4374 | EXPORT_SYMBOL(sysctl_tcp_reordering); | 4473 | EXPORT_SYMBOL(sysctl_tcp_reordering); |
| 4474 | EXPORT_SYMBOL(sysctl_tcp_abc); | ||
| 4375 | EXPORT_SYMBOL(tcp_parse_options); | 4475 | EXPORT_SYMBOL(tcp_parse_options); |
| 4376 | EXPORT_SYMBOL(tcp_rcv_established); | 4476 | EXPORT_SYMBOL(tcp_rcv_established); |
| 4377 | EXPORT_SYMBOL(tcp_rcv_state_process); | 4477 | EXPORT_SYMBOL(tcp_rcv_state_process); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 634dabb558fd..4d5021e1929b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
| @@ -39,7 +39,7 @@ | |||
| 39 | * request_sock handling and moved | 39 | * request_sock handling and moved |
| 40 | * most of it into the af independent code. | 40 | * most of it into the af independent code. |
| 41 | * Added tail drop and some other bugfixes. | 41 | * Added tail drop and some other bugfixes. |
| 42 | * Added new listen sematics. | 42 | * Added new listen semantics. |
| 43 | * Mike McLagan : Routing by source | 43 | * Mike McLagan : Routing by source |
| 44 | * Juan Jose Ciarlante: ip_dynaddr bits | 44 | * Juan Jose Ciarlante: ip_dynaddr bits |
| 45 | * Andi Kleen: various fixes. | 45 | * Andi Kleen: various fixes. |
| @@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) | |||
| 1110 | static int tcp_v4_checksum_init(struct sk_buff *skb) | 1110 | static int tcp_v4_checksum_init(struct sk_buff *skb) |
| 1111 | { | 1111 | { |
| 1112 | if (skb->ip_summed == CHECKSUM_HW) { | 1112 | if (skb->ip_summed == CHECKSUM_HW) { |
| 1113 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1114 | if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, | 1113 | if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, |
| 1115 | skb->nh.iph->daddr, skb->csum)) | 1114 | skb->nh.iph->daddr, skb->csum)) { |
| 1115 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1116 | return 0; | 1116 | return 0; |
| 1117 | 1117 | } | |
| 1118 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); | ||
| 1119 | skb->ip_summed = CHECKSUM_NONE; | ||
| 1120 | } | 1118 | } |
| 1119 | |||
| 1120 | skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr, | ||
| 1121 | skb->len, IPPROTO_TCP, 0); | ||
| 1122 | |||
| 1121 | if (skb->len <= 76) { | 1123 | if (skb->len <= 76) { |
| 1122 | if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, | 1124 | return __skb_checksum_complete(skb); |
| 1123 | skb->nh.iph->daddr, | ||
| 1124 | skb_checksum(skb, 0, skb->len, 0))) | ||
| 1125 | return -1; | ||
| 1126 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1127 | } else { | ||
| 1128 | skb->csum = ~tcp_v4_check(skb->h.th, skb->len, | ||
| 1129 | skb->nh.iph->saddr, | ||
| 1130 | skb->nh.iph->daddr, 0); | ||
| 1131 | } | 1125 | } |
| 1132 | return 0; | 1126 | return 0; |
| 1133 | } | 1127 | } |
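tcp_v4_checksum_init() now trusts a hardware sum only after the pseudo-header check passes, seeds skb->csum for deferred verification otherwise, and fully verifies short packets via __skb_checksum_complete(). A self-contained userspace sketch of the fold-and-verify step that helper performs (names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Fold a 32-bit one's-complement accumulator to 16 bits and invert;
     * a valid packet (checksum field included in the sum) folds to 0. */
    static uint16_t csum_fold32(uint32_t sum)
    {
            sum = (sum & 0xffff) + (sum >> 16);     /* fold carries */
            sum = (sum & 0xffff) + (sum >> 16);     /* fold new carry */
            return (uint16_t)~sum;
    }

    static int checksum_ok(const uint8_t *data, size_t len, uint32_t seed)
    {
            uint32_t sum = seed;    /* e.g. the pseudo-header sum */
            size_t i;

            for (i = 0; i + 1 < len; i += 2)
                    sum += (uint32_t)data[i] << 8 | data[i + 1];
            if (len & 1)
                    sum += (uint32_t)data[len - 1] << 8;
            return csum_fold32(sum) == 0;
    }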
| @@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
| 1216 | 1210 | ||
| 1217 | /* An explanation is required here, I think. | 1211 | /* An explanation is required here, I think. |
| 1218 | * Packet length and doff are validated by header prediction, | 1212 | * Packet length and doff are validated by header prediction, |
| 1219 | * provided case of th->doff==0 is elimineted. | 1213 | * provided case of th->doff==0 is eliminated. |
| 1220 | * So, we defer the checks. */ | 1214 | * So, we defer the checks. */ |
| 1221 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && | 1215 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && |
| 1222 | tcp_v4_checksum_init(skb) < 0)) | 1216 | tcp_v4_checksum_init(skb))) |
| 1223 | goto bad_packet; | 1217 | goto bad_packet; |
| 1224 | 1218 | ||
| 1225 | th = skb->h.th; | 1219 | th = skb->h.th; |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4a..1b66a2ac4321 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
| @@ -158,7 +158,7 @@ kill_with_rst: | |||
| 158 | /* I am shamed, but failed to make it more elegant. | 158 | /* I am shamed, but failed to make it more elegant. |
| 159 | * Yes, it is direct reference to IP, which is impossible | 159 | * Yes, it is direct reference to IP, which is impossible |
| 160 | * to generalize to IPv6. Taking into account that IPv6 | 160 | * to generalize to IPv6. Taking into account that IPv6 |
| 161 | * do not undertsnad recycling in any case, it not | 161 | * do not understand recycling in any case, it not |
| 162 | * a big problem in practice. --ANK */ | 162 | * a big problem in practice. --ANK */ |
| 163 | if (tw->tw_family == AF_INET && | 163 | if (tw->tw_family == AF_INET && |
| 164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && | 164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
| @@ -194,7 +194,7 @@ kill_with_rst: | |||
| 194 | /* In window segment, it may be only reset or bare ack. */ | 194 | /* In window segment, it may be only reset or bare ack. */ |
| 195 | 195 | ||
| 196 | if (th->rst) { | 196 | if (th->rst) { |
| 197 | /* This is TIME_WAIT assasination, in two flavors. | 197 | /* This is TIME_WAIT assassination, in two flavors. |
| 198 | * Oh well... nobody has a sufficient solution to this | 198 | * Oh well... nobody has a sufficient solution to this |
| 199 | * protocol bug yet. | 199 | * protocol bug yet. |
| 200 | */ | 200 | */ |
| @@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 380 | */ | 380 | */ |
| 381 | newtp->snd_cwnd = 2; | 381 | newtp->snd_cwnd = 2; |
| 382 | newtp->snd_cwnd_cnt = 0; | 382 | newtp->snd_cwnd_cnt = 0; |
| 383 | newtp->bytes_acked = 0; | ||
| 383 | 384 | ||
| 384 | newtp->frto_counter = 0; | 385 | newtp->frto_counter = 0; |
| 385 | newtp->frto_highmark = 0; | 386 | newtp->frto_highmark = 0; |
| @@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
| 550 | 551 | ||
| 551 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... | 552 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... |
| 552 | * and the incoming segment acknowledges something not yet | 553 | * and the incoming segment acknowledges something not yet |
| 553 | * sent (the segment carries an unaccaptable ACK) ... | 554 | * sent (the segment carries an unacceptable ACK) ... |
| 554 | * a reset is sent." | 555 | * a reset is sent." |
| 555 | * | 556 | * |
| 556 | * Invalid ACK: reset will be sent by listening socket | 557 | * Invalid ACK: reset will be sent by listening socket |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f4..029c70dfb585 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss | |||
| 436 | u16 flags; | 436 | u16 flags; |
| 437 | 437 | ||
| 438 | BUG_ON(len > skb->len); | 438 | BUG_ON(len > skb->len); |
| 439 | |||
| 440 | clear_all_retrans_hints(tp); | ||
| 439 | nsize = skb_headlen(skb) - len; | 441 | nsize = skb_headlen(skb) - len; |
| 440 | if (nsize < 0) | 442 | if (nsize < 0) |
| 441 | nsize = 0; | 443 | nsize = 0; |
| @@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
| 599 | for TCP options, but includes only bare TCP header. | 601 | for TCP options, but includes only bare TCP header. |
| 600 | 602 | ||
| 601 | tp->rx_opt.mss_clamp is mss negotiated at connection setup. | 603 | tp->rx_opt.mss_clamp is mss negotiated at connection setup. |
| 602 | It is minumum of user_mss and mss received with SYN. | 604 | It is minimum of user_mss and mss received with SYN. |
| 603 | It also does not include TCP options. | 605 | It also does not include TCP options. |
| 604 | 606 | ||
| 605 | tp->pmtu_cookie is last pmtu, seen by this function. | 607 | tp->pmtu_cookie is last pmtu, seen by this function. |
| @@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
| 1171 | { | 1173 | { |
| 1172 | struct inet_connection_sock *icsk = inet_csk(sk); | 1174 | struct inet_connection_sock *icsk = inet_csk(sk); |
| 1173 | struct tcp_sock *tp = tcp_sk(sk); | 1175 | struct tcp_sock *tp = tcp_sk(sk); |
| 1174 | /* MSS for the peer's data. Previous verions used mss_clamp | 1176 | /* MSS for the peer's data. Previous versions used mss_clamp |
| 1175 | * here. I don't know if the value based on our guesses | 1177 | * here. I don't know if the value based on our guesses |
| 1176 | * of peer's MSS is better for the performance. It's more correct | 1178 | * of peer's MSS is better for the performance. It's more correct |
| 1177 | * but may be worse for the performance because of rcv_mss | 1179 | * but may be worse for the performance because of rcv_mss |
| @@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
| 1260 | BUG_ON(tcp_skb_pcount(skb) != 1 || | 1262 | BUG_ON(tcp_skb_pcount(skb) != 1 || |
| 1261 | tcp_skb_pcount(next_skb) != 1); | 1263 | tcp_skb_pcount(next_skb) != 1); |
| 1262 | 1264 | ||
| 1263 | /* Ok. We will be able to collapse the packet. */ | 1265 | /* changing transmit queue under us so clear hints */ |
| 1266 | clear_all_retrans_hints(tp); | ||
| 1267 | |||
| 1268 | /* Ok. We will be able to collapse the packet. */ | ||
| 1264 | __skb_unlink(next_skb, &sk->sk_write_queue); | 1269 | __skb_unlink(next_skb, &sk->sk_write_queue); |
| 1265 | 1270 | ||
| 1266 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); | 1271 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); |
| @@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) | |||
| 1330 | } | 1335 | } |
| 1331 | } | 1336 | } |
| 1332 | 1337 | ||
| 1338 | clear_all_retrans_hints(tp); | ||
| 1339 | |||
| 1333 | if (!lost) | 1340 | if (!lost) |
| 1334 | return; | 1341 | return; |
| 1335 | 1342 | ||
| @@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1361 | int err; | 1368 | int err; |
| 1362 | 1369 | ||
| 1363 | /* Do not sent more than we queued. 1/4 is reserved for possible | 1370 | /* Do not sent more than we queued. 1/4 is reserved for possible |
| 1364 | * copying overhead: frgagmentation, tunneling, mangling etc. | 1371 | * copying overhead: fragmentation, tunneling, mangling etc. |
| 1365 | */ | 1372 | */ |
| 1366 | if (atomic_read(&sk->sk_wmem_alloc) > | 1373 | if (atomic_read(&sk->sk_wmem_alloc) > |
| 1367 | min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) | 1374 | min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) |
| @@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1468 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1475 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1469 | struct tcp_sock *tp = tcp_sk(sk); | 1476 | struct tcp_sock *tp = tcp_sk(sk); |
| 1470 | struct sk_buff *skb; | 1477 | struct sk_buff *skb; |
| 1471 | int packet_cnt = tp->lost_out; | 1478 | int packet_cnt; |
| 1479 | |||
| 1480 | if (tp->retransmit_skb_hint) { | ||
| 1481 | skb = tp->retransmit_skb_hint; | ||
| 1482 | packet_cnt = tp->retransmit_cnt_hint; | ||
| 1483 | }else{ | ||
| 1484 | skb = sk->sk_write_queue.next; | ||
| 1485 | packet_cnt = 0; | ||
| 1486 | } | ||
| 1472 | 1487 | ||
| 1473 | /* First pass: retransmit lost packets. */ | 1488 | /* First pass: retransmit lost packets. */ |
| 1474 | if (packet_cnt) { | 1489 | if (tp->lost_out) { |
| 1475 | sk_stream_for_retrans_queue(skb, sk) { | 1490 | sk_stream_for_retrans_queue_from(skb, sk) { |
| 1476 | __u8 sacked = TCP_SKB_CB(skb)->sacked; | 1491 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
| 1477 | 1492 | ||
| 1493 | /* we could do better than to assign each time */ | ||
| 1494 | tp->retransmit_skb_hint = skb; | ||
| 1495 | tp->retransmit_cnt_hint = packet_cnt; | ||
| 1496 | |||
| 1478 | /* Assume this retransmit will generate | 1497 | /* Assume this retransmit will generate |
| 1479 | * only one packet for congestion window | 1498 | * only one packet for congestion window |
| 1480 | * calculation purposes. This works because | 1499 | * calculation purposes. This works because |
| @@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1485 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | 1504 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) |
| 1486 | return; | 1505 | return; |
| 1487 | 1506 | ||
| 1488 | if (sacked&TCPCB_LOST) { | 1507 | if (sacked & TCPCB_LOST) { |
| 1489 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | 1508 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { |
| 1490 | if (tcp_retransmit_skb(sk, skb)) | 1509 | if (tcp_retransmit_skb(sk, skb)) { |
| 1510 | tp->retransmit_skb_hint = NULL; | ||
| 1491 | return; | 1511 | return; |
| 1512 | } | ||
| 1492 | if (icsk->icsk_ca_state != TCP_CA_Loss) | 1513 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
| 1493 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); | 1514 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); |
| 1494 | else | 1515 | else |
| @@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1501 | TCP_RTO_MAX); | 1522 | TCP_RTO_MAX); |
| 1502 | } | 1523 | } |
| 1503 | 1524 | ||
| 1504 | packet_cnt -= tcp_skb_pcount(skb); | 1525 | packet_cnt += tcp_skb_pcount(skb); |
| 1505 | if (packet_cnt <= 0) | 1526 | if (packet_cnt >= tp->lost_out) |
| 1506 | break; | 1527 | break; |
| 1507 | } | 1528 | } |
| 1508 | } | 1529 | } |
| @@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1528 | if (tcp_may_send_now(sk, tp)) | 1549 | if (tcp_may_send_now(sk, tp)) |
| 1529 | return; | 1550 | return; |
| 1530 | 1551 | ||
| 1531 | packet_cnt = 0; | 1552 | if (tp->forward_skb_hint) { |
| 1553 | skb = tp->forward_skb_hint; | ||
| 1554 | packet_cnt = tp->forward_cnt_hint; | ||
| 1555 | } else{ | ||
| 1556 | skb = sk->sk_write_queue.next; | ||
| 1557 | packet_cnt = 0; | ||
| 1558 | } | ||
| 1559 | |||
| 1560 | sk_stream_for_retrans_queue_from(skb, sk) { | ||
| 1561 | tp->forward_cnt_hint = packet_cnt; | ||
| 1562 | tp->forward_skb_hint = skb; | ||
| 1532 | 1563 | ||
| 1533 | sk_stream_for_retrans_queue(skb, sk) { | ||
| 1534 | /* Similar to the retransmit loop above we | 1564 | /* Similar to the retransmit loop above we |
| 1535 | * can pretend that the retransmitted SKB | 1565 | * can pretend that the retransmitted SKB |
| 1536 | * we send out here will be composed of one | 1566 | * we send out here will be composed of one |
| @@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 1547 | continue; | 1577 | continue; |
| 1548 | 1578 | ||
| 1549 | /* Ok, retransmit it. */ | 1579 | /* Ok, retransmit it. */ |
| 1550 | if (tcp_retransmit_skb(sk, skb)) | 1580 | if (tcp_retransmit_skb(sk, skb)) { |
| 1581 | tp->forward_skb_hint = NULL; | ||
| 1551 | break; | 1582 | break; |
| 1583 | } | ||
| 1552 | 1584 | ||
| 1553 | if (skb == skb_peek(&sk->sk_write_queue)) | 1585 | if (skb == skb_peek(&sk->sk_write_queue)) |
| 1554 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 1586 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
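Both retransmit passes above now resume from cached queue positions (tp->retransmit_skb_hint / tp->forward_skb_hint) instead of rescanning from the queue head on every ACK; any path that restructures the write queue clears the hints via clear_all_retrans_hints(). A minimal sketch of the cached-cursor pattern (struct pkt, struct queue and next_lost are illustrative):

    #include <stddef.h>

    struct pkt {
            struct pkt *next;
            int lost;               /* marked lost, wants retransmit */
    };

    struct queue {
            struct pkt *head;
            struct pkt *hint;       /* resume point; NULL = start over */
    };

    /* Resume the scan where the previous one stopped. Callers must set
     * q->hint = NULL whenever the list is split, merged or reordered,
     * which is what clear_all_retrans_hints() does in the kernel. */
    static struct pkt *next_lost(struct queue *q)
    {
            struct pkt *p = q->hint ? q->hint : q->head;

            for (; p; p = p->next) {
                    if (p->lost) {
                            q->hint = p->next;
                            return p;
                    }
            }
            q->hint = NULL;
            return NULL;
    }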
| @@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); | |||
| 2058 | EXPORT_SYMBOL(tcp_make_synack); | 2090 | EXPORT_SYMBOL(tcp_make_synack); |
| 2059 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2091 | EXPORT_SYMBOL(tcp_simple_retransmit); |
| 2060 | EXPORT_SYMBOL(tcp_sync_mss); | 2092 | EXPORT_SYMBOL(tcp_sync_mss); |
| 2093 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); | ||
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf5522..26d7486ee501 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
| @@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
| 20 | u32 in_flight, int flag) | 20 | u32 in_flight, int flag) |
| 21 | { | 21 | { |
| 22 | struct tcp_sock *tp = tcp_sk(sk); | 22 | struct tcp_sock *tp = tcp_sk(sk); |
| 23 | if (in_flight < tp->snd_cwnd) | 23 | |
| 24 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
| 24 | return; | 25 | return; |
| 25 | 26 | ||
| 26 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 27 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
| 27 | tp->snd_cwnd++; | 28 | tcp_slow_start(tp); |
| 28 | } else { | 29 | else { |
| 29 | tp->snd_cwnd_cnt++; | 30 | tp->snd_cwnd_cnt++; |
| 30 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | 31 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ |
| 31 | tp->snd_cwnd++; | 32 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
| 33 | tp->snd_cwnd++; | ||
| 32 | tp->snd_cwnd_cnt = 0; | 34 | tp->snd_cwnd_cnt = 0; |
| 33 | } | 35 | } |
| 34 | } | 36 | } |
| 35 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 36 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 37 | } | 37 | } |
| 38 | 38 | ||
| 39 | static u32 tcp_scalable_ssthresh(struct sock *sk) | 39 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
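The rewrite above routes slow start through the shared tcp_slow_start() helper and clamps snd_cwnd at the point of increase rather than after the fact. The congestion-avoidance rule it keeps is Scalable TCP's: one extra segment per min(cwnd, TCP_SCALABLE_AI_CNT) ACKs, TCP_SCALABLE_AI_CNT being 50 in this file. A sketch of that additive step (scalable_ai is an illustrative name):

    enum { AI_CNT = 50 };   /* mirrors TCP_SCALABLE_AI_CNT */

    static void scalable_ai(unsigned int *cwnd, unsigned int *cnt,
                            unsigned int clamp)
    {
            /* one extra segment per min(cwnd, AI_CNT) ACKs, i.e. a
             * cwnd/AI_CNT-per-RTT growth rate, clamped on increase */
            if (++*cnt > (*cwnd < AI_CNT ? *cwnd : AI_CNT)) {
                    if (*cwnd < clamp)
                            (*cwnd)++;
                    *cnt = 0;
            }
    }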
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c5..e1880959614a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
| @@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) | |||
| 58 | * to prevent DoS attacks. It is called when a retransmission timeout | 58 | * to prevent DoS attacks. It is called when a retransmission timeout |
| 59 | * or zero probe timeout occurs on orphaned socket. | 59 | * or zero probe timeout occurs on orphaned socket. |
| 60 | * | 60 | * |
| 61 | * Criterium is still not confirmed experimentally and may change. | 61 | * Criteria is still not confirmed experimentally and may change. |
| 62 | * We kill the socket, if: | 62 | * We kill the socket, if: |
| 63 | * 1. If number of orphaned sockets exceeds an administratively configured | 63 | * 1. If number of orphaned sockets exceeds an administratively configured |
| 64 | * limit. | 64 | * limit. |
| @@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) | |||
| 132 | hole detection. :-( | 132 | hole detection. :-( |
| 133 | 133 | ||
| 134 | It is place to make it. It is not made. I do not want | 134 | It is place to make it. It is not made. I do not want |
| 135 | to make it. It is disguisting. It does not work in any | 135 | to make it. It is disgusting. It does not work in any |
| 136 | case. Let me to cite the same draft, which requires for | 136 | case. Let me to cite the same draft, which requires for |
| 137 | us to implement this: | 137 | us to implement this: |
| 138 | 138 | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f9..b7d296a8ac6d 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
| @@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
| 236 | /* We don't have enough RTT samples to do the Vegas | 236 | /* We don't have enough RTT samples to do the Vegas |
| 237 | * calculation, so we'll behave like Reno. | 237 | * calculation, so we'll behave like Reno. |
| 238 | */ | 238 | */ |
| 239 | if (tp->snd_cwnd > tp->snd_ssthresh) | 239 | tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); |
| 240 | tp->snd_cwnd++; | ||
| 241 | } else { | 240 | } else { |
| 242 | u32 rtt, target_cwnd, diff; | 241 | u32 rtt, target_cwnd, diff; |
| 243 | 242 | ||
| @@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
| 275 | */ | 274 | */ |
| 276 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | 275 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; |
| 277 | 276 | ||
| 278 | if (tp->snd_cwnd < tp->snd_ssthresh) { | 277 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
| 279 | /* Slow start. */ | 278 | /* Slow start. */ |
| 280 | if (diff > gamma) { | 279 | if (diff > gamma) { |
| 281 | /* Going too fast. Time to slow down | 280 | /* Going too fast. Time to slow down |
| @@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
| 295 | V_PARAM_SHIFT)+1); | 294 | V_PARAM_SHIFT)+1); |
| 296 | 295 | ||
| 297 | } | 296 | } |
| 297 | tcp_slow_start(tp); | ||
| 298 | } else { | 298 | } else { |
| 299 | /* Congestion avoidance. */ | 299 | /* Congestion avoidance. */ |
| 300 | u32 next_snd_cwnd; | 300 | u32 next_snd_cwnd; |
| @@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
| 327 | else if (next_snd_cwnd < tp->snd_cwnd) | 327 | else if (next_snd_cwnd < tp->snd_cwnd) |
| 328 | tp->snd_cwnd--; | 328 | tp->snd_cwnd--; |
| 329 | } | 329 | } |
| 330 | } | ||
| 331 | 330 | ||
| 332 | /* Wipe the slate clean for the next RTT. */ | 331 | if (tp->snd_cwnd < 2) |
| 333 | vegas->cntRTT = 0; | 332 | tp->snd_cwnd = 2; |
| 334 | vegas->minRTT = 0x7fffffff; | 333 | else if (tp->snd_cwnd > tp->snd_cwnd_clamp) |
| 334 | tp->snd_cwnd = tp->snd_cwnd_clamp; | ||
| 335 | } | ||
| 335 | } | 336 | } |
| 336 | 337 | ||
| 337 | /* The following code is executed for every ack we receive, | 338 | /* Wipe the slate clean for the next RTT. */ |
| 338 | * except for conditions checked in should_advance_cwnd() | 339 | vegas->cntRTT = 0; |
| 339 | * before the call to tcp_cong_avoid(). Mainly this means that | 340 | vegas->minRTT = 0x7fffffff; |
| 340 | * we only execute this code if the ack actually acked some | ||
| 341 | * data. | ||
| 342 | */ | ||
| 343 | |||
| 344 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
| 345 | * (If we are not in slow start then we are in congestion avoidance, | ||
| 346 | * and adjust our congestion window only once per RTT. See the code | ||
| 347 | * above.) | ||
| 348 | */ | ||
| 349 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
| 350 | tp->snd_cwnd++; | ||
| 351 | |||
| 352 | /* to keep cwnd from growing without bound */ | ||
| 353 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 354 | |||
| 355 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
| 356 | * 2 MSS. | ||
| 357 | * | ||
| 358 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
| 359 | */ | ||
| 360 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
| 361 | } | 341 | } |
| 362 | 342 | ||
| 363 | /* Extract info for Tcp socket info provided via netlink. */ | 343 | /* Extract info for Tcp socket info provided via netlink. */ |
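Besides falling back to tcp_reno_cong_avoid() when RTT samples are scarce and wiping the per-RTT state unconditionally, the Vegas logic above still hinges on one estimate: diff, roughly the number of segments the connection keeps queued in the network. A sketch of that estimate, assuming rtt_us >= base_rtt_us > 0 (vegas_queued is illustrative):

    /* Segments the flow keeps queued in the network, per Vegas:
     * expected rate is cwnd/base_rtt, actual is cwnd/rtt, and their
     * difference over one base RTT is cwnd * (rtt - base_rtt) / rtt. */
    static unsigned int vegas_queued(unsigned int cwnd,
                                     unsigned int base_rtt_us,
                                     unsigned int rtt_us)
    {
            unsigned long long extra =     /* 64-bit to avoid overflow */
                (unsigned long long)cwnd * (rtt_us - base_rtt_us) / rtt_us;
            return (unsigned int)extra;
    }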
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e0bd1013cb0d..2422a5f7195d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c | |||
| @@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) | |||
| 761 | 761 | ||
| 762 | static __inline__ int __udp_checksum_complete(struct sk_buff *skb) | 762 | static __inline__ int __udp_checksum_complete(struct sk_buff *skb) |
| 763 | { | 763 | { |
| 764 | return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); | 764 | return __skb_checksum_complete(skb); |
| 765 | } | 765 | } |
| 766 | 766 | ||
| 767 | static __inline__ int udp_checksum_complete(struct sk_buff *skb) | 767 | static __inline__ int udp_checksum_complete(struct sk_buff *skb) |
| @@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, | |||
| 1100 | if (uh->check == 0) { | 1100 | if (uh->check == 0) { |
| 1101 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 1101 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 1102 | } else if (skb->ip_summed == CHECKSUM_HW) { | 1102 | } else if (skb->ip_summed == CHECKSUM_HW) { |
| 1103 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1104 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) | 1103 | if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) |
| 1105 | return 0; | 1104 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 1106 | LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); | ||
| 1107 | skb->ip_summed = CHECKSUM_NONE; | ||
| 1108 | } | 1105 | } |
| 1109 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | 1106 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) |
| 1110 | skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); | 1107 | skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); |
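The UDP change above follows the same convergence as the TCP one: ip_summed becomes CHECKSUM_UNNECESSARY only once the hardware sum survives the pseudo-header check, and everything else is deferred to a single software verify at delivery time. A toy state machine for that deferral (all names illustrative):

    enum csum_state { CSUM_NONE, CSUM_HW, CSUM_UNNECESSARY };

    /* Trust hardware only after the pseudo-header check passed;
     * everything else funnels into one software verify (0 == good)
     * right before the payload is consumed. */
    static int deliver_ok(enum csum_state *st, int hw_sum_valid,
                          int (*sw_verify)(void *pkt), void *pkt)
    {
            if (*st == CSUM_HW && hw_sum_valid)
                    *st = CSUM_UNNECESSARY;
            if (*st == CSUM_UNNECESSARY)
                    return 1;
            return sw_verify(pkt) == 0;
    }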
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 23e540365a14..1bdf0fb8bf8a 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c | |||
| @@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
| 585 | daddr = &skb->nh.ipv6h->daddr; | 585 | daddr = &skb->nh.ipv6h->daddr; |
| 586 | 586 | ||
| 587 | /* Perform checksum. */ | 587 | /* Perform checksum. */ |
| 588 | if (skb->ip_summed == CHECKSUM_HW) { | 588 | switch (skb->ip_summed) { |
| 589 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 589 | case CHECKSUM_HW: |
| 590 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, | 590 | if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, |
| 591 | skb->csum)) { | 591 | skb->csum)) |
| 592 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); | 592 | break; |
| 593 | skb->ip_summed = CHECKSUM_NONE; | 593 | /* fall through */ |
| 594 | } | 594 | case CHECKSUM_NONE: |
| 595 | } | 595 | skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len, |
| 596 | if (skb->ip_summed == CHECKSUM_NONE) { | 596 | IPPROTO_ICMPV6, 0); |
| 597 | if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, | 597 | if (__skb_checksum_complete(skb)) { |
| 598 | skb_checksum(skb, 0, skb->len, 0))) { | ||
| 599 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", | 598 | LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", |
| 600 | NIP6(*saddr), NIP6(*daddr)); | 599 | NIP6(*saddr), NIP6(*daddr)); |
| 601 | goto discard_it; | 600 | goto discard_it; |
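The rewritten icmpv6_rcv() expresses the same policy as a switch: a failed CHECKSUM_HW verdict falls through into the CHECKSUM_NONE arm, which seeds skb->csum with the complemented pseudo-header sum before the full software check. A compilable miniature of that control flow (all names and callbacks illustrative):

    #include <stdint.h>

    enum state { HW, NONE, UNNECESSARY };

    static int icmp_csum_path(enum state st, int hw_ok,
                              uint32_t (*pseudo_sum)(void),
                              int (*full_verify)(uint32_t seed))
    {
            switch (st) {
            case HW:
                    if (hw_ok)
                            break;          /* hardware proved it */
                    /* fall through */
            case NONE:
                    /* seed with complemented pseudo-header sum, then
                     * let the full software check settle it */
                    if (full_verify(~pseudo_sum()))
                            return -1;      /* bad checksum: drop */
                    break;
            case UNNECESSARY:
                    break;
            }
            return 0;
    }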
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 651c79b41eeb..8e9628f1c4c5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c | |||
| @@ -298,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb, | |||
| 298 | static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) | 298 | static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) |
| 299 | { | 299 | { |
| 300 | if ((raw6_sk(sk)->checksum || sk->sk_filter) && | 300 | if ((raw6_sk(sk)->checksum || sk->sk_filter) && |
| 301 | skb->ip_summed != CHECKSUM_UNNECESSARY) { | 301 | skb_checksum_complete(skb)) { |
| 302 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | 302 | /* FIXME: increment a raw6 drops counter here */ |
| 303 | /* FIXME: increment a raw6 drops counter here */ | 303 | kfree_skb(skb); |
| 304 | kfree_skb(skb); | 304 | return 0; |
| 305 | return 0; | ||
| 306 | } | ||
| 307 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 308 | } | 305 | } |
| 309 | 306 | ||
| 310 | /* Charge it to the socket. */ | 307 | /* Charge it to the socket. */ |
| @@ -337,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) | |||
| 337 | if (!rp->checksum) | 334 | if (!rp->checksum) |
| 338 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 335 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 339 | 336 | ||
| 340 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | 337 | if (skb->ip_summed == CHECKSUM_HW) { |
| 341 | if (skb->ip_summed == CHECKSUM_HW) { | 338 | skb_postpull_rcsum(skb, skb->nh.raw, |
| 342 | skb_postpull_rcsum(skb, skb->nh.raw, | 339 | skb->h.raw - skb->nh.raw); |
| 343 | skb->h.raw - skb->nh.raw); | 340 | if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr, |
| 341 | &skb->nh.ipv6h->daddr, | ||
| 342 | skb->len, inet->num, skb->csum)) | ||
| 344 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 343 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 345 | if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, | ||
| 346 | &skb->nh.ipv6h->daddr, | ||
| 347 | skb->len, inet->num, skb->csum)) { | ||
| 348 | LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n"); | ||
| 349 | skb->ip_summed = CHECKSUM_NONE; | ||
| 350 | } | ||
| 351 | } | ||
| 352 | if (skb->ip_summed == CHECKSUM_NONE) | ||
| 353 | skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, | ||
| 354 | &skb->nh.ipv6h->daddr, | ||
| 355 | skb->len, inet->num, 0); | ||
| 356 | } | 344 | } |
| 345 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | ||
| 346 | skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, | ||
| 347 | &skb->nh.ipv6h->daddr, | ||
| 348 | skb->len, inet->num, 0); | ||
| 357 | 349 | ||
| 358 | if (inet->hdrincl) { | 350 | if (inet->hdrincl) { |
| 359 | if (skb->ip_summed != CHECKSUM_UNNECESSARY && | 351 | if (skb_checksum_complete(skb)) { |
| 360 | (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | ||
| 361 | /* FIXME: increment a raw6 drops counter here */ | 352 | /* FIXME: increment a raw6 drops counter here */ |
| 362 | kfree_skb(skb); | 353 | kfree_skb(skb); |
| 363 | return 0; | 354 | return 0; |
| 364 | } | 355 | } |
| 365 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 366 | } | 356 | } |
| 367 | 357 | ||
| 368 | rawv6_rcv_skb(sk, skb); | 358 | rawv6_rcv_skb(sk, skb); |
| @@ -407,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, | |||
| 407 | if (skb->ip_summed==CHECKSUM_UNNECESSARY) { | 397 | if (skb->ip_summed==CHECKSUM_UNNECESSARY) { |
| 408 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | 398 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); |
| 409 | } else if (msg->msg_flags&MSG_TRUNC) { | 399 | } else if (msg->msg_flags&MSG_TRUNC) { |
| 410 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) | 400 | if (__skb_checksum_complete(skb)) |
| 411 | goto csum_copy_err; | 401 | goto csum_copy_err; |
| 412 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); | 402 | err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); |
| 413 | } else { | 403 | } else { |
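Several call sites in this file now go through skb_checksum_complete() rather than open-coding the fold. Reconstructed from these call sites, the wrapper this series adds behaves approximately like the following (treat the exact body as an assumption):

    /* Skip the software sum when an earlier stage already proved the
     * packet; otherwise fold-and-verify (and, inside the helper,
     * report NICs whose hardware sum disagrees). */
    static inline unsigned int skb_checksum_complete(struct sk_buff *skb)
    {
            return skb->ip_summed != CHECKSUM_UNNECESSARY &&
                   __skb_checksum_complete(skb);
    }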
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d746d3b27efb..62c0e5bd931c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
| @@ -1401,20 +1401,18 @@ out: | |||
| 1401 | static int tcp_v6_checksum_init(struct sk_buff *skb) | 1401 | static int tcp_v6_checksum_init(struct sk_buff *skb) |
| 1402 | { | 1402 | { |
| 1403 | if (skb->ip_summed == CHECKSUM_HW) { | 1403 | if (skb->ip_summed == CHECKSUM_HW) { |
| 1404 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1405 | if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | 1404 | if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, |
| 1406 | &skb->nh.ipv6h->daddr,skb->csum)) | 1405 | &skb->nh.ipv6h->daddr,skb->csum)) { |
| 1406 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1407 | return 0; | 1407 | return 0; |
| 1408 | LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); | 1408 | } |
| 1409 | } | 1409 | } |
| 1410 | |||
| 1411 | skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | ||
| 1412 | &skb->nh.ipv6h->daddr, 0); | ||
| 1413 | |||
| 1410 | if (skb->len <= 76) { | 1414 | if (skb->len <= 76) { |
| 1411 | if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | 1415 | return __skb_checksum_complete(skb); |
| 1412 | &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) | ||
| 1413 | return -1; | ||
| 1414 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 1415 | } else { | ||
| 1416 | skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, | ||
| 1417 | &skb->nh.ipv6h->daddr,0); | ||
| 1418 | } | 1416 | } |
| 1419 | return 0; | 1417 | return 0; |
| 1420 | } | 1418 | } |
| @@ -1575,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
| 1575 | goto discard_it; | 1573 | goto discard_it; |
| 1576 | 1574 | ||
| 1577 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && | 1575 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && |
| 1578 | tcp_v6_checksum_init(skb) < 0)) | 1576 | tcp_v6_checksum_init(skb))) |
| 1579 | goto bad_packet; | 1577 | goto bad_packet; |
| 1580 | 1578 | ||
| 1581 | th = skb->h.th; | 1579 | th = skb->h.th; |
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bf9519341fd3..e671153b47b2 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c | |||
| @@ -248,7 +248,7 @@ try_again: | |||
| 248 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, | 248 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, |
| 249 | copied); | 249 | copied); |
| 250 | } else if (msg->msg_flags&MSG_TRUNC) { | 250 | } else if (msg->msg_flags&MSG_TRUNC) { |
| 251 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) | 251 | if (__skb_checksum_complete(skb)) |
| 252 | goto csum_copy_err; | 252 | goto csum_copy_err; |
| 253 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, | 253 | err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, |
| 254 | copied); | 254 | copied); |
| @@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) | |||
| 363 | return -1; | 363 | return -1; |
| 364 | } | 364 | } |
| 365 | 365 | ||
| 366 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | 366 | if (skb_checksum_complete(skb)) { |
| 367 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | 367 | UDP6_INC_STATS_BH(UDP_MIB_INERRORS); |
| 368 | UDP6_INC_STATS_BH(UDP_MIB_INERRORS); | 368 | kfree_skb(skb); |
| 369 | kfree_skb(skb); | 369 | return 0; |
| 370 | return 0; | ||
| 371 | } | ||
| 372 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 373 | } | 370 | } |
| 374 | 371 | ||
| 375 | if (sock_queue_rcv_skb(sk,skb)<0) { | 372 | if (sock_queue_rcv_skb(sk,skb)<0) { |
| @@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
| 491 | uh = skb->h.uh; | 488 | uh = skb->h.uh; |
| 492 | } | 489 | } |
| 493 | 490 | ||
| 494 | if (skb->ip_summed==CHECKSUM_HW) { | 491 | if (skb->ip_summed == CHECKSUM_HW && |
| 492 | !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) | ||
| 495 | skb->ip_summed = CHECKSUM_UNNECESSARY; | 493 | skb->ip_summed = CHECKSUM_UNNECESSARY; |
| 496 | if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { | 494 | |
| 497 | LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n"); | ||
| 498 | skb->ip_summed = CHECKSUM_NONE; | ||
| 499 | } | ||
| 500 | } | ||
| 501 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | 495 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) |
| 502 | skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); | 496 | skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); |
| 503 | 497 | ||
| @@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) | |||
| 521 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) | 515 | if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) |
| 522 | goto discard; | 516 | goto discard; |
| 523 | 517 | ||
| 524 | if (skb->ip_summed != CHECKSUM_UNNECESSARY && | 518 | if (skb_checksum_complete(skb)) |
| 525 | (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) | ||
| 526 | goto discard; | 519 | goto discard; |
| 527 | UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); | 520 | UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); |
| 528 | 521 | ||
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c index 122c086ee2db..dbe6105e83a5 100644 --- a/net/rxrpc/transport.c +++ b/net/rxrpc/transport.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/in.h> | 23 | #include <linux/in.h> |
| 24 | #include <linux/in6.h> | 24 | #include <linux/in6.h> |
| 25 | #include <linux/icmp.h> | 25 | #include <linux/icmp.h> |
| 26 | #include <linux/skbuff.h> | ||
| 26 | #include <net/sock.h> | 27 | #include <net/sock.h> |
| 27 | #include <net/ip.h> | 28 | #include <net/ip.h> |
| 28 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) | 29 | #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) |
| @@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) | |||
| 475 | 476 | ||
| 476 | /* we'll probably need to checksum it (didn't call | 477 | /* we'll probably need to checksum it (didn't call |
| 477 | * sock_recvmsg) */ | 478 | * sock_recvmsg) */ |
| 478 | if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { | 479 | if (skb_checksum_complete(pkt)) { |
| 479 | if ((unsigned short) | 480 | kfree_skb(pkt); |
| 480 | csum_fold(skb_checksum(pkt, 0, pkt->len, | 481 | rxrpc_krxiod_queue_transport(trans); |
| 481 | pkt->csum))) { | 482 | _leave(" CSUM failed"); |
| 482 | kfree_skb(pkt); | 483 | return; |
| 483 | rxrpc_krxiod_queue_transport(trans); | ||
| 484 | _leave(" CSUM failed"); | ||
| 485 | return; | ||
| 486 | } | ||
| 487 | } | 484 | } |
| 488 | 485 | ||
| 489 | addr = pkt->nh.iph->saddr; | 486 | addr = pkt->nh.iph->saddr; |
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 8f97e90f36c8..eb330d4f66d6 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c | |||
| @@ -6,6 +6,9 @@ | |||
| 6 | * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> | 6 | * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> |
| 7 | */ | 7 | */ |
| 8 | 8 | ||
| 9 | #include <linux/compiler.h> | ||
| 10 | #include <linux/netdevice.h> | ||
| 11 | #include <linux/skbuff.h> | ||
| 9 | #include <linux/types.h> | 12 | #include <linux/types.h> |
| 10 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
| 11 | #include <linux/udp.h> | 14 | #include <linux/udp.h> |
| @@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) | |||
| 165 | return -1; | 168 | return -1; |
| 166 | if ((unsigned short)csum_fold(desc.csum)) | 169 | if ((unsigned short)csum_fold(desc.csum)) |
| 167 | return -1; | 170 | return -1; |
| 171 | if (unlikely(skb->ip_summed == CHECKSUM_HW)) | ||
| 172 | netdev_rx_csum_fault(skb->dev); | ||
| 168 | return 0; | 173 | return 0; |
| 169 | no_checksum: | 174 | no_checksum: |
| 170 | if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) | 175 | if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) |
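The added netdev_rx_csum_fault() call above fires when a software verify contradicts a NIC's CHECKSUM_HW claim, naming the offending device so broken receive offload can be spotted. A trivial userspace analogue of that diagnostic (report_hw_csum_fault is illustrative):

    #include <stdio.h>

    /* Name the device once when software verification contradicts its
     * hardware "already checksummed" claim. */
    static void report_hw_csum_fault(const char *dev, int hw_claimed,
                                     int sw_ok)
    {
            if (hw_claimed && !sw_ok)
                    fprintf(stderr, "%s: hw csum failure\n", dev);
    }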
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f16e7cdd6150..e50e7cf43737 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c | |||
| @@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) | |||
| 623 | /* we can use it in-place */ | 623 | /* we can use it in-place */ |
| 624 | rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); | 624 | rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); |
| 625 | rqstp->rq_arg.head[0].iov_len = len; | 625 | rqstp->rq_arg.head[0].iov_len = len; |
| 626 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) { | 626 | if (skb_checksum_complete(skb)) { |
| 627 | if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { | 627 | skb_free_datagram(svsk->sk_sk, skb); |
| 628 | skb_free_datagram(svsk->sk_sk, skb); | 628 | return 0; |
| 629 | return 0; | ||
| 630 | } | ||
| 631 | skb->ip_summed = CHECKSUM_UNNECESSARY; | ||
| 632 | } | 629 | } |
| 633 | rqstp->rq_skbuff = skb; | 630 | rqstp->rq_skbuff = skb; |
| 634 | } | 631 | } |
