aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/core/datagram.c21
-rw-r--r--net/core/dev.c12
-rw-r--r--net/core/netpoll.c18
-rw-r--r--net/ipv4/icmp.c6
-rw-r--r--net/ipv4/igmp.c19
-rw-r--r--net/ipv4/ip_gre.c15
-rw-r--r--net/ipv4/netfilter/ip_conntrack_proto_icmp.c11
-rw-r--r--net/ipv4/sysctl_net_ipv4.c8
-rw-r--r--net/ipv4/tcp.c3
-rw-r--r--net/ipv4/tcp_bic.c12
-rw-r--r--net/ipv4/tcp_cong.c40
-rw-r--r--net/ipv4/tcp_highspeed.c11
-rw-r--r--net/ipv4/tcp_htcp.c13
-rw-r--r--net/ipv4/tcp_hybla.c6
-rw-r--r--net/ipv4/tcp_input.c288
-rw-r--r--net/ipv4/tcp_ipv4.c28
-rw-r--r--net/ipv4/tcp_minisocks.c7
-rw-r--r--net/ipv4/tcp_output.c61
-rw-r--r--net/ipv4/tcp_scalable.c14
-rw-r--r--net/ipv4/tcp_timer.c4
-rw-r--r--net/ipv4/tcp_vegas.c42
-rw-r--r--net/ipv4/udp.c7
-rw-r--r--net/ipv6/icmp.c21
-rw-r--r--net/ipv6/raw.c42
-rw-r--r--net/ipv6/tcp_ipv6.c20
-rw-r--r--net/ipv6/udp.c25
-rw-r--r--net/rxrpc/transport.c15
-rw-r--r--net/sunrpc/socklib.c5
-rw-r--r--net/sunrpc/svcsock.c9
29 files changed, 459 insertions, 324 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d219435d086c..1bcfef51ac58 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -350,6 +350,20 @@ fault:
350 return -EFAULT; 350 return -EFAULT;
351} 351}
352 352
353unsigned int __skb_checksum_complete(struct sk_buff *skb)
354{
355 unsigned int sum;
356
357 sum = (u16)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
358 if (likely(!sum)) {
359 if (unlikely(skb->ip_summed == CHECKSUM_HW))
360 netdev_rx_csum_fault(skb->dev);
361 skb->ip_summed = CHECKSUM_UNNECESSARY;
362 }
363 return sum;
364}
365EXPORT_SYMBOL(__skb_checksum_complete);
366
353/** 367/**
354 * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. 368 * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec.
355 * @skb: skbuff 369 * @skb: skbuff
@@ -363,7 +377,7 @@ fault:
363 * -EFAULT - fault during copy. Beware, in this case iovec 377 * -EFAULT - fault during copy. Beware, in this case iovec
364 * can be modified! 378 * can be modified!
365 */ 379 */
366int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, 380int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
367 int hlen, struct iovec *iov) 381 int hlen, struct iovec *iov)
368{ 382{
369 unsigned int csum; 383 unsigned int csum;
@@ -376,8 +390,7 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
376 iov++; 390 iov++;
377 391
378 if (iov->iov_len < chunk) { 392 if (iov->iov_len < chunk) {
379 if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, 393 if (__skb_checksum_complete(skb))
380 skb->csum)))
381 goto csum_error; 394 goto csum_error;
382 if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) 395 if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
383 goto fault; 396 goto fault;
@@ -388,6 +401,8 @@ int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb,
388 goto fault; 401 goto fault;
389 if ((unsigned short)csum_fold(csum)) 402 if ((unsigned short)csum_fold(csum))
390 goto csum_error; 403 goto csum_error;
404 if (unlikely(skb->ip_summed == CHECKSUM_HW))
405 netdev_rx_csum_fault(skb->dev);
391 iov->iov_len -= chunk; 406 iov->iov_len -= chunk;
392 iov->iov_base += chunk; 407 iov->iov_base += chunk;
393 } 408 }
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d1541595277..0b48e294aafe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1108,6 +1108,18 @@ out:
1108 return ret; 1108 return ret;
1109} 1109}
1110 1110
1111/* Take action when hardware reception checksum errors are detected. */
1112#ifdef CONFIG_BUG
1113void netdev_rx_csum_fault(struct net_device *dev)
1114{
1115 if (net_ratelimit()) {
1116 printk(KERN_ERR "%s: hw csum failure.\n", dev->name);
1117 dump_stack();
1118 }
1119}
1120EXPORT_SYMBOL(netdev_rx_csum_fault);
1121#endif
1122
1111#ifdef CONFIG_HIGHMEM 1123#ifdef CONFIG_HIGHMEM
1112/* Actually, we should eliminate this check as soon as we know, that: 1124/* Actually, we should eliminate this check as soon as we know, that:
1113 * 1. IOMMU is present and allows to map all the memory. 1125 * 1. IOMMU is present and allows to map all the memory.
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 802fe11efad0..49424a42a2c0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -101,16 +101,20 @@ void netpoll_queue(struct sk_buff *skb)
101static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, 101static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
102 unsigned short ulen, u32 saddr, u32 daddr) 102 unsigned short ulen, u32 saddr, u32 daddr)
103{ 103{
104 if (uh->check == 0) 104 unsigned int psum;
105
106 if (uh->check == 0 || skb->ip_summed == CHECKSUM_UNNECESSARY)
105 return 0; 107 return 0;
106 108
107 if (skb->ip_summed == CHECKSUM_HW) 109 psum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
108 return csum_tcpudp_magic( 110
109 saddr, daddr, ulen, IPPROTO_UDP, skb->csum); 111 if (skb->ip_summed == CHECKSUM_HW &&
112 !(u16)csum_fold(csum_add(psum, skb->csum)))
113 return 0;
110 114
111 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); 115 skb->csum = psum;
112 116
113 return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); 117 return __skb_checksum_complete(skb);
114} 118}
115 119
116/* 120/*
@@ -489,7 +493,7 @@ int __netpoll_rx(struct sk_buff *skb)
489 493
490 if (ulen != len) 494 if (ulen != len)
491 goto out; 495 goto out;
492 if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) 496 if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr))
493 goto out; 497 goto out;
494 if (np->local_ip && np->local_ip != ntohl(iph->daddr)) 498 if (np->local_ip && np->local_ip != ntohl(iph->daddr))
495 goto out; 499 goto out;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 175e093ec564..e3eceecd0496 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -934,11 +934,11 @@ int icmp_rcv(struct sk_buff *skb)
934 case CHECKSUM_HW: 934 case CHECKSUM_HW:
935 if (!(u16)csum_fold(skb->csum)) 935 if (!(u16)csum_fold(skb->csum))
936 break; 936 break;
937 LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); 937 /* fall through */
938 case CHECKSUM_NONE: 938 case CHECKSUM_NONE:
939 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) 939 skb->csum = 0;
940 if (__skb_checksum_complete(skb))
940 goto error; 941 goto error;
941 default:;
942 } 942 }
943 943
944 if (!pskb_pull(skb, sizeof(struct icmphdr))) 944 if (!pskb_pull(skb, sizeof(struct icmphdr)))
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c6247fc84060..c04607b49212 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -872,11 +872,18 @@ int igmp_rcv(struct sk_buff *skb)
872 return 0; 872 return 0;
873 } 873 }
874 874
875 if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || 875 if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
876 (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { 876 goto drop;
877 in_dev_put(in_dev); 877
878 kfree_skb(skb); 878 switch (skb->ip_summed) {
879 return 0; 879 case CHECKSUM_HW:
880 if (!(u16)csum_fold(skb->csum))
881 break;
882 /* fall through */
883 case CHECKSUM_NONE:
884 skb->csum = 0;
885 if (__skb_checksum_complete(skb))
886 goto drop;
880 } 887 }
881 888
882 ih = skb->h.igmph; 889 ih = skb->h.igmph;
@@ -906,6 +913,8 @@ int igmp_rcv(struct sk_buff *skb)
906 default: 913 default:
907 NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); 914 NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type);
908 } 915 }
916
917drop:
909 in_dev_put(in_dev); 918 in_dev_put(in_dev);
910 kfree_skb(skb); 919 kfree_skb(skb);
911 return 0; 920 return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 896ce3f8f53a..4e9c74b54b15 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -577,15 +577,16 @@ static int ipgre_rcv(struct sk_buff *skb)
577 goto drop_nolock; 577 goto drop_nolock;
578 578
579 if (flags&GRE_CSUM) { 579 if (flags&GRE_CSUM) {
580 if (skb->ip_summed == CHECKSUM_HW) { 580 switch (skb->ip_summed) {
581 case CHECKSUM_HW:
581 csum = (u16)csum_fold(skb->csum); 582 csum = (u16)csum_fold(skb->csum);
582 if (csum) 583 if (!csum)
583 skb->ip_summed = CHECKSUM_NONE; 584 break;
584 } 585 /* fall through */
585 if (skb->ip_summed == CHECKSUM_NONE) { 586 case CHECKSUM_NONE:
586 skb->csum = skb_checksum(skb, 0, skb->len, 0); 587 skb->csum = 0;
588 csum = __skb_checksum_complete(skb);
587 skb->ip_summed = CHECKSUM_HW; 589 skb->ip_summed = CHECKSUM_HW;
588 csum = (u16)csum_fold(skb->csum);
589 } 590 }
590 offset += 4; 591 offset += 4;
591 } 592 }
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 5198f3a1e2cd..e4d6b268e8c4 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -13,6 +13,7 @@
13#include <linux/in.h> 13#include <linux/in.h>
14#include <linux/icmp.h> 14#include <linux/icmp.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/skbuff.h>
16#include <net/ip.h> 17#include <net/ip.h>
17#include <net/checksum.h> 18#include <net/checksum.h>
18#include <linux/netfilter.h> 19#include <linux/netfilter.h>
@@ -230,19 +231,15 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
230 case CHECKSUM_HW: 231 case CHECKSUM_HW:
231 if (!(u16)csum_fold(skb->csum)) 232 if (!(u16)csum_fold(skb->csum))
232 break; 233 break;
233 if (LOG_INVALID(IPPROTO_ICMP)) 234 /* fall through */
234 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
235 "ip_ct_icmp: bad HW ICMP checksum ");
236 return -NF_ACCEPT;
237 case CHECKSUM_NONE: 235 case CHECKSUM_NONE:
238 if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { 236 skb->csum = 0;
237 if (__skb_checksum_complete(skb)) {
239 if (LOG_INVALID(IPPROTO_ICMP)) 238 if (LOG_INVALID(IPPROTO_ICMP))
240 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, 239 nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL,
241 "ip_ct_icmp: bad ICMP checksum "); 240 "ip_ct_icmp: bad ICMP checksum ");
242 return -NF_ACCEPT; 241 return -NF_ACCEPT;
243 } 242 }
244 default:
245 break;
246 } 243 }
247 244
248checksum_skipped: 245checksum_skipped:
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 652685623519..01444a02b48b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
645 .proc_handler = &proc_tcp_congestion_control, 645 .proc_handler = &proc_tcp_congestion_control,
646 .strategy = &sysctl_tcp_congestion_control, 646 .strategy = &sysctl_tcp_congestion_control,
647 }, 647 },
648 {
649 .ctl_name = NET_TCP_ABC,
650 .procname = "tcp_abc",
651 .data = &sysctl_tcp_abc,
652 .maxlen = sizeof(int),
653 .mode = 0644,
654 .proc_handler = &proc_dointvec,
655 },
648 656
649 { .ctl_name = 0 } 657 { .ctl_name = 0 }
650}; 658};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 72b7c22e1ea5..9ac7a4f46bd8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1640 } else if (tcp_need_reset(old_state) || 1640 } else if (tcp_need_reset(old_state) ||
1641 (tp->snd_nxt != tp->write_seq && 1641 (tp->snd_nxt != tp->write_seq &&
1642 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { 1642 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1643 /* The last check adjusts for discrepance of Linux wrt. RFC 1643 /* The last check adjusts for discrepancy of Linux wrt. RFC
1644 * states 1644 * states
1645 */ 1645 */
1646 tcp_send_active_reset(sk, gfp_any()); 1646 tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
1669 tp->packets_out = 0; 1669 tp->packets_out = 0;
1670 tp->snd_ssthresh = 0x7fffffff; 1670 tp->snd_ssthresh = 0x7fffffff;
1671 tp->snd_cwnd_cnt = 0; 1671 tp->snd_cwnd_cnt = 0;
1672 tp->bytes_acked = 0;
1672 tcp_set_ca_state(sk, TCP_CA_Open); 1673 tcp_set_ca_state(sk, TCP_CA_Open);
1673 tcp_clear_retrans(tp); 1674 tcp_clear_retrans(tp);
1674 inet_csk_delack_init(sk); 1675 inet_csk_delack_init(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e0609047..1d0cd86621b1 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
217 217
218 bictcp_low_utilization(sk, data_acked); 218 bictcp_low_utilization(sk, data_acked);
219 219
220 if (in_flight < tp->snd_cwnd) 220 if (!tcp_is_cwnd_limited(sk, in_flight))
221 return; 221 return;
222 222
223 if (tp->snd_cwnd <= tp->snd_ssthresh) { 223 if (tp->snd_cwnd <= tp->snd_ssthresh)
224 /* In "safe" area, increase. */ 224 tcp_slow_start(tp);
225 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 225 else {
226 tp->snd_cwnd++;
227 } else {
228 bictcp_update(ca, tp->snd_cwnd); 226 bictcp_update(ca, tp->snd_cwnd);
229 227
230 /* In dangerous area, increase slowly. 228 /* In dangerous area, increase slowly.
231 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd 229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
232 */ 230 */
233 if (tp->snd_cwnd_cnt >= ca->cnt) { 231 if (tp->snd_cwnd_cnt >= ca->cnt) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e89..c7cc62c8dc12 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
186{ 186{
187 struct tcp_sock *tp = tcp_sk(sk); 187 struct tcp_sock *tp = tcp_sk(sk);
188 188
189 if (in_flight < tp->snd_cwnd) 189 if (!tcp_is_cwnd_limited(sk, in_flight))
190 return; 190 return;
191 191
192 if (tp->snd_cwnd <= tp->snd_ssthresh) { 192 /* In "safe" area, increase. */
193 /* In "safe" area, increase. */ 193 if (tp->snd_cwnd <= tp->snd_ssthresh)
194 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 194 tcp_slow_start(tp);
195 tp->snd_cwnd++; 195
196 } else { 196 /* In dangerous area, increase slowly. */
197 /* In dangerous area, increase slowly. 197 else if (sysctl_tcp_abc) {
198 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd 198 /* RFC3465: Apppriate Byte Count
199 */ 199 * increase once for each full cwnd acked
200 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 200 */
201 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 201 if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
202 tp->snd_cwnd++; 202 tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
203 tp->snd_cwnd_cnt = 0; 203 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
204 } else 204 tp->snd_cwnd++;
205 tp->snd_cwnd_cnt++; 205 }
206 } 206 } else {
207 /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
208 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
209 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
210 tp->snd_cwnd++;
211 tp->snd_cwnd_cnt = 0;
212 } else
213 tp->snd_cwnd_cnt++;
214 }
207} 215}
208EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 216EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
209 217
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde080..82b3c189bd7d 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
111} 111}
112 112
113static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, 113static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
114 u32 in_flight, int good) 114 u32 in_flight, u32 pkts_acked)
115{ 115{
116 struct tcp_sock *tp = tcp_sk(sk); 116 struct tcp_sock *tp = tcp_sk(sk);
117 struct hstcp *ca = inet_csk_ca(sk); 117 struct hstcp *ca = inet_csk_ca(sk);
118 118
119 if (in_flight < tp->snd_cwnd) 119 if (!tcp_is_cwnd_limited(sk, in_flight))
120 return; 120 return;
121 121
122 if (tp->snd_cwnd <= tp->snd_ssthresh) { 122 if (tp->snd_cwnd <= tp->snd_ssthresh)
123 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 123 tcp_slow_start(tp);
124 tp->snd_cwnd++; 124 else {
125 } else {
126 /* Update AIMD parameters */ 125 /* Update AIMD parameters */
127 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { 126 if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
128 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && 127 while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e95..3284cfb993e6 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
207 struct tcp_sock *tp = tcp_sk(sk); 207 struct tcp_sock *tp = tcp_sk(sk);
208 struct htcp *ca = inet_csk_ca(sk); 208 struct htcp *ca = inet_csk_ca(sk);
209 209
210 if (in_flight < tp->snd_cwnd) 210 if (!tcp_is_cwnd_limited(sk, in_flight))
211 return; 211 return;
212 212
213 if (tp->snd_cwnd <= tp->snd_ssthresh) { 213 if (tp->snd_cwnd <= tp->snd_ssthresh)
214 /* In "safe" area, increase. */ 214 tcp_slow_start(tp);
215 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 215 else {
216 tp->snd_cwnd++; 216
217 } else {
218 measure_rtt(sk); 217 measure_rtt(sk);
219 218
220 /* keep track of number of round-trip times since last backoff event */ 219 /* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
224 htcp_alpha_update(ca); 223 htcp_alpha_update(ca);
225 } 224 }
226 225
227 /* In dangerous area, increase slowly. 226 /* In dangerous area, increase slowly.
228 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd 227 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
229 */ 228 */
230 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { 229 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623df..40dbb3877510 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
100 ca->minrtt = tp->srtt; 100 ca->minrtt = tp->srtt;
101 } 101 }
102 102
103 if (!tcp_is_cwnd_limited(sk, in_flight))
104 return;
105
103 if (!ca->hybla_en) 106 if (!ca->hybla_en)
104 return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); 107 return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
105 108
106 if (in_flight < tp->snd_cwnd)
107 return;
108
109 if (ca->rho == 0) 109 if (ca->rho == 0)
110 hybla_recalc_param(sk); 110 hybla_recalc_param(sk);
111 111
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578dc..40a26b7157b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
42 * Andi Kleen : Moved open_request checking here 42 * Andi Kleen : Moved open_request checking here
43 * and process RSTs for open_requests. 43 * and process RSTs for open_requests.
44 * Andi Kleen : Better prune_queue, and other fixes. 44 * Andi Kleen : Better prune_queue, and other fixes.
45 * Andrey Savochkin: Fix RTT measurements in the presnce of 45 * Andrey Savochkin: Fix RTT measurements in the presence of
46 * timestamps. 46 * timestamps.
47 * Andrey Savochkin: Check sequence numbers correctly when 47 * Andrey Savochkin: Check sequence numbers correctly when
48 * removing SACKs due to in sequence incoming 48 * removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
89int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
90 90
91int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
92int sysctl_tcp_abc = 1;
92 93
93#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 94#define FLAG_DATA 0x01 /* Incoming frame contained data. */
94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 95#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
223 * of receiver window. Check #2. 224 * of receiver window. Check #2.
224 * 225 *
225 * The scheme does not work when sender sends good segments opening 226 * The scheme does not work when sender sends good segments opening
226 * window and then starts to feed us spagetti. But it should work 227 * window and then starts to feed us spaghetti. But it should work
227 * in common situations. Otherwise, we have to rely on queue collapsing. 228 * in common situations. Otherwise, we have to rely on queue collapsing.
228 */ 229 */
229 230
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
233{ 234{
234 /* Optimize this! */ 235 /* Optimize this! */
235 int truesize = tcp_win_from_space(skb->truesize)/2; 236 int truesize = tcp_win_from_space(skb->truesize)/2;
236 int window = tcp_full_space(sk)/2; 237 int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
237 238
238 while (tp->rcv_ssthresh <= window) { 239 while (tp->rcv_ssthresh <= window) {
239 if (truesize <= skb->len) 240 if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
277 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); 278 int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
278 279
279 /* Try to select rcvbuf so that 4 mss-sized segments 280 /* Try to select rcvbuf so that 4 mss-sized segments
280 * will fit to window and correspoding skbs will fit to our rcvbuf. 281 * will fit to window and corresponding skbs will fit to our rcvbuf.
281 * (was 3; 4 is minimum to allow fast retransmit to work.) 282 * (was 3; 4 is minimum to allow fast retransmit to work.)
282 */ 283 */
283 while (tcp_win_from_space(rcvmem) < tp->advmss) 284 while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
286 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); 287 sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
287} 288}
288 289
289/* 4. Try to fixup all. It is made iimediately after connection enters 290/* 4. Try to fixup all. It is made immediately after connection enters
290 * established state. 291 * established state.
291 */ 292 */
292static void tcp_init_buffer_space(struct sock *sk) 293static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
326static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 327static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
327{ 328{
328 struct inet_connection_sock *icsk = inet_csk(sk); 329 struct inet_connection_sock *icsk = inet_csk(sk);
329 struct sk_buff *skb;
330 unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
331 int ofo_win = 0;
332 330
333 icsk->icsk_ack.quick = 0; 331 icsk->icsk_ack.quick = 0;
334 332
335 skb_queue_walk(&tp->out_of_order_queue, skb) { 333 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
336 ofo_win += skb->len; 334 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
337 } 335 !tcp_memory_pressure &&
338 336 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
339 /* If overcommit is due to out of order segments, 337 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
340 * do not clamp window. Try to expand rcvbuf instead. 338 sysctl_tcp_rmem[2]);
341 */
342 if (ofo_win) {
343 if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
344 !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
345 !tcp_memory_pressure &&
346 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
347 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
348 sysctl_tcp_rmem[2]);
349 } 339 }
350 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { 340 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
351 app_win += ofo_win;
352 if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
353 app_win >>= 1;
354 if (app_win > icsk->icsk_ack.rcv_mss)
355 app_win -= icsk->icsk_ack.rcv_mss;
356 app_win = max(app_win, 2U*tp->advmss);
357
358 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); 341 tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
359 }
360} 342}
361 343
362/* Receiver "autotuning" code. 344/* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
385 * are stalled on filesystem I/O. 367 * are stalled on filesystem I/O.
386 * 368 *
387 * Also, since we are only going for a minimum in the 369 * Also, since we are only going for a minimum in the
388 * non-timestamp case, we do not smoothe things out 370 * non-timestamp case, we do not smoother things out
389 * else with timestamps disabled convergance takes too 371 * else with timestamps disabled convergence takes too
390 * long. 372 * long.
391 */ 373 */
392 if (!win_dep) { 374 if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
395 } else if (m < new_sample) 377 } else if (m < new_sample)
396 new_sample = m << 3; 378 new_sample = m << 3;
397 } else { 379 } else {
398 /* No previous mesaure. */ 380 /* No previous measure. */
399 new_sample = m << 3; 381 new_sample = m << 3;
400 } 382 }
401 383
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
524 if (icsk->icsk_ack.ato > icsk->icsk_rto) 506 if (icsk->icsk_ack.ato > icsk->icsk_rto)
525 icsk->icsk_ack.ato = icsk->icsk_rto; 507 icsk->icsk_ack.ato = icsk->icsk_rto;
526 } else if (m > icsk->icsk_rto) { 508 } else if (m > icsk->icsk_rto) {
527 /* Too long gap. Apparently sender falled to 509 /* Too long gap. Apparently sender failed to
528 * restart window, so that we send ACKs quickly. 510 * restart window, so that we send ACKs quickly.
529 */ 511 */
530 tcp_incr_quickack(sk); 512 tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
548 * To save cycles in the RFC 1323 implementation it was better to break 530 * To save cycles in the RFC 1323 implementation it was better to break
549 * it up into three procedures. -- erics 531 * it up into three procedures. -- erics
550 */ 532 */
551static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) 533static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
552{ 534{
553 struct tcp_sock *tp = tcp_sk(sk); 535 struct tcp_sock *tp = tcp_sk(sk);
554 const struct inet_connection_sock *icsk = inet_csk(sk);
555 long m = mrtt; /* RTT */ 536 long m = mrtt; /* RTT */
556 537
557 /* The following amusing code comes from Jacobson's 538 /* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
565 * 546 *
566 * Funny. This algorithm seems to be very broken. 547 * Funny. This algorithm seems to be very broken.
567 * These formulae increase RTO, when it should be decreased, increase 548 * These formulae increase RTO, when it should be decreased, increase
568 * too slowly, when it should be incresed fastly, decrease too fastly 549 * too slowly, when it should be increased fastly, decrease too fastly
569 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely 550 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
570 * does not matter how to _calculate_ it. Seems, it was trap 551 * does not matter how to _calculate_ it. Seems, it was trap
571 * that VJ failed to avoid. 8) 552 * that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
610 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); 591 tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
611 tp->rtt_seq = tp->snd_nxt; 592 tp->rtt_seq = tp->snd_nxt;
612 } 593 }
613
614 if (icsk->icsk_ca_ops->rtt_sample)
615 icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
616} 594}
617 595
618/* Calculate rto without backoff. This is the second half of Van Jacobson's 596/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
629 * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ 607 * at least by solaris and freebsd. "Erratic ACKs" has _nothing_
630 * to do with delayed acks, because at cwnd>2 true delack timeout 608 * to do with delayed acks, because at cwnd>2 true delack timeout
631 * is invisible. Actually, Linux-2.4 also generates erratic 609 * is invisible. Actually, Linux-2.4 also generates erratic
632 * ACKs in some curcumstances. 610 * ACKs in some circumstances.
633 */ 611 */
634 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; 612 inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
635 613
636 /* 2. Fixups made earlier cannot be right. 614 /* 2. Fixups made earlier cannot be right.
637 * If we do not estimate RTO correctly without them, 615 * If we do not estimate RTO correctly without them,
638 * all the algo is pure shit and should be replaced 616 * all the algo is pure shit and should be replaced
639 * with correct one. It is exaclty, which we pretend to do. 617 * with correct one. It is exactly, which we pretend to do.
640 */ 618 */
641} 619}
642 620
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
794 * to make it more realistic. 772 * to make it more realistic.
795 * 773 *
796 * A bit of theory. RTT is time passed after "normal" sized packet 774 * A bit of theory. RTT is time passed after "normal" sized packet
797 * is sent until it is ACKed. In normal curcumstances sending small 775 * is sent until it is ACKed. In normal circumstances sending small
798 * packets force peer to delay ACKs and calculation is correct too. 776 * packets force peer to delay ACKs and calculation is correct too.
799 * The algorithm is adaptive and, provided we follow specs, it 777 * The algorithm is adaptive and, provided we follow specs, it
800 * NEVER underestimate RTT. BUT! If peer tries to make some clever 778 * NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
919 int prior_fackets; 897 int prior_fackets;
920 u32 lost_retrans = 0; 898 u32 lost_retrans = 0;
921 int flag = 0; 899 int flag = 0;
900 int dup_sack = 0;
922 int i; 901 int i;
923 902
924 if (!tp->sacked_out) 903 if (!tp->sacked_out)
925 tp->fackets_out = 0; 904 tp->fackets_out = 0;
926 prior_fackets = tp->fackets_out; 905 prior_fackets = tp->fackets_out;
927 906
928 for (i=0; i<num_sacks; i++, sp++) { 907 /* SACK fastpath:
929 struct sk_buff *skb; 908 * if the only SACK change is the increase of the end_seq of
930 __u32 start_seq = ntohl(sp->start_seq); 909 * the first block then only apply that SACK block
931 __u32 end_seq = ntohl(sp->end_seq); 910 * and use retrans queue hinting otherwise slowpath */
932 int fack_count = 0; 911 flag = 1;
933 int dup_sack = 0; 912 for (i = 0; i< num_sacks; i++) {
913 __u32 start_seq = ntohl(sp[i].start_seq);
914 __u32 end_seq = ntohl(sp[i].end_seq);
915
916 if (i == 0){
917 if (tp->recv_sack_cache[i].start_seq != start_seq)
918 flag = 0;
919 } else {
920 if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
921 (tp->recv_sack_cache[i].end_seq != end_seq))
922 flag = 0;
923 }
924 tp->recv_sack_cache[i].start_seq = start_seq;
925 tp->recv_sack_cache[i].end_seq = end_seq;
934 926
935 /* Check for D-SACK. */ 927 /* Check for D-SACK. */
936 if (i == 0) { 928 if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
962 if (before(ack, prior_snd_una - tp->max_window)) 954 if (before(ack, prior_snd_una - tp->max_window))
963 return 0; 955 return 0;
964 } 956 }
957 }
958
959 if (flag)
960 num_sacks = 1;
961 else {
962 int j;
963 tp->fastpath_skb_hint = NULL;
964
965 /* order SACK blocks to allow in order walk of the retrans queue */
966 for (i = num_sacks-1; i > 0; i--) {
967 for (j = 0; j < i; j++){
968 if (after(ntohl(sp[j].start_seq),
969 ntohl(sp[j+1].start_seq))){
970 sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
971 sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
972 sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
973 sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
974 }
975
976 }
977 }
978 }
979
980 /* clear flag as used for different purpose in following code */
981 flag = 0;
982
983 for (i=0; i<num_sacks; i++, sp++) {
984 struct sk_buff *skb;
985 __u32 start_seq = ntohl(sp->start_seq);
986 __u32 end_seq = ntohl(sp->end_seq);
987 int fack_count;
988
989 /* Use SACK fastpath hint if valid */
990 if (tp->fastpath_skb_hint) {
991 skb = tp->fastpath_skb_hint;
992 fack_count = tp->fastpath_cnt_hint;
993 } else {
994 skb = sk->sk_write_queue.next;
995 fack_count = 0;
996 }
965 997
966 /* Event "B" in the comment above. */ 998 /* Event "B" in the comment above. */
967 if (after(end_seq, tp->high_seq)) 999 if (after(end_seq, tp->high_seq))
968 flag |= FLAG_DATA_LOST; 1000 flag |= FLAG_DATA_LOST;
969 1001
970 sk_stream_for_retrans_queue(skb, sk) { 1002 sk_stream_for_retrans_queue_from(skb, sk) {
971 int in_sack, pcount; 1003 int in_sack, pcount;
972 u8 sacked; 1004 u8 sacked;
973 1005
1006 tp->fastpath_skb_hint = skb;
1007 tp->fastpath_cnt_hint = fack_count;
1008
974 /* The retransmission queue is always in order, so 1009 /* The retransmission queue is always in order, so
975 * we can short-circuit the walk early. 1010 * we can short-circuit the walk early.
976 */ 1011 */
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1045 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1080 TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
1046 tp->lost_out -= tcp_skb_pcount(skb); 1081 tp->lost_out -= tcp_skb_pcount(skb);
1047 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083
1084 /* clear lost hint */
1085 tp->retransmit_skb_hint = NULL;
1048 } 1086 }
1049 } else { 1087 } else {
1050 /* New sack for not retransmitted frame, 1088 /* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1057 if (sacked & TCPCB_LOST) { 1095 if (sacked & TCPCB_LOST) {
1058 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1096 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1059 tp->lost_out -= tcp_skb_pcount(skb); 1097 tp->lost_out -= tcp_skb_pcount(skb);
1098
1099 /* clear lost hint */
1100 tp->retransmit_skb_hint = NULL;
1060 } 1101 }
1061 } 1102 }
1062 1103
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1080 (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { 1121 (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1122 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1123 tp->retrans_out -= tcp_skb_pcount(skb);
1124 tp->retransmit_skb_hint = NULL;
1083 } 1125 }
1084 } 1126 }
1085 } 1127 }
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1107 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1149 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1108 tp->retrans_out -= tcp_skb_pcount(skb); 1150 tp->retrans_out -= tcp_skb_pcount(skb);
1109 1151
1152 /* clear lost hint */
1153 tp->retransmit_skb_hint = NULL;
1154
1110 if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { 1155 if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
1111 tp->lost_out += tcp_skb_pcount(skb); 1156 tp->lost_out += tcp_skb_pcount(skb);
1112 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1157 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
1214 tcp_set_ca_state(sk, TCP_CA_Loss); 1259 tcp_set_ca_state(sk, TCP_CA_Loss);
1215 tp->high_seq = tp->frto_highmark; 1260 tp->high_seq = tp->frto_highmark;
1216 TCP_ECN_queue_cwr(tp); 1261 TCP_ECN_queue_cwr(tp);
1262
1263 clear_all_retrans_hints(tp);
1217} 1264}
1218 1265
1219void tcp_clear_retrans(struct tcp_sock *tp) 1266void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
1251 tp->snd_cwnd_cnt = 0; 1298 tp->snd_cwnd_cnt = 0;
1252 tp->snd_cwnd_stamp = tcp_time_stamp; 1299 tp->snd_cwnd_stamp = tcp_time_stamp;
1253 1300
1301 tp->bytes_acked = 0;
1254 tcp_clear_retrans(tp); 1302 tcp_clear_retrans(tp);
1255 1303
1256 /* Push undo marker, if it was plain RTO and nothing 1304 /* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1279 tcp_set_ca_state(sk, TCP_CA_Loss); 1327 tcp_set_ca_state(sk, TCP_CA_Loss);
1280 tp->high_seq = tp->snd_nxt; 1328 tp->high_seq = tp->snd_nxt;
1281 TCP_ECN_queue_cwr(tp); 1329 TCP_ECN_queue_cwr(tp);
1330
1331 clear_all_retrans_hints(tp);
1282} 1332}
1283 1333
1284static int tcp_check_sack_reneging(struct sock *sk) 1334static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
1503 int packets, u32 high_seq) 1553 int packets, u32 high_seq)
1504{ 1554{
1505 struct sk_buff *skb; 1555 struct sk_buff *skb;
1506 int cnt = packets; 1556 int cnt;
1507 1557
1508 BUG_TRAP(cnt <= tp->packets_out); 1558 BUG_TRAP(packets <= tp->packets_out);
1559 if (tp->lost_skb_hint) {
1560 skb = tp->lost_skb_hint;
1561 cnt = tp->lost_cnt_hint;
1562 } else {
1563 skb = sk->sk_write_queue.next;
1564 cnt = 0;
1565 }
1509 1566
1510 sk_stream_for_retrans_queue(skb, sk) { 1567 sk_stream_for_retrans_queue_from(skb, sk) {
1511 cnt -= tcp_skb_pcount(skb); 1568 /* TODO: do this better */
1512 if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) 1569 /* this is not the most efficient way to do this... */
1570 tp->lost_skb_hint = skb;
1571 tp->lost_cnt_hint = cnt;
1572 cnt += tcp_skb_pcount(skb);
1573 if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
1513 break; 1574 break;
1514 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { 1575 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1515 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1576 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1516 tp->lost_out += tcp_skb_pcount(skb); 1577 tp->lost_out += tcp_skb_pcount(skb);
1578
1579 /* clear xmit_retransmit_queue hints
1580 * if this is beyond hint */
1581 if(tp->retransmit_skb_hint != NULL &&
1582 before(TCP_SKB_CB(skb)->seq,
1583 TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
1584
1585 tp->retransmit_skb_hint = NULL;
1586 }
1517 } 1587 }
1518 } 1588 }
1519 tcp_sync_left_out(tp); 1589 tcp_sync_left_out(tp);
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
1540 if (tcp_head_timedout(sk, tp)) { 1610 if (tcp_head_timedout(sk, tp)) {
1541 struct sk_buff *skb; 1611 struct sk_buff *skb;
1542 1612
1543 sk_stream_for_retrans_queue(skb, sk) { 1613 skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
1544 if (tcp_skb_timedout(sk, skb) && 1614 : sk->sk_write_queue.next;
1545 !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { 1615
1616 sk_stream_for_retrans_queue_from(skb, sk) {
1617 if (!tcp_skb_timedout(sk, skb))
1618 break;
1619
1620 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
1546 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1621 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1547 tp->lost_out += tcp_skb_pcount(skb); 1622 tp->lost_out += tcp_skb_pcount(skb);
1623
1624 /* clear xmit_retrans hint */
1625 if (tp->retransmit_skb_hint &&
1626 before(TCP_SKB_CB(skb)->seq,
1627 TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
1628
1629 tp->retransmit_skb_hint = NULL;
1548 } 1630 }
1549 } 1631 }
1632
1633 tp->scoreboard_skb_hint = skb;
1634
1550 tcp_sync_left_out(tp); 1635 tcp_sync_left_out(tp);
1551 } 1636 }
1552} 1637}
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
1626 } 1711 }
1627 tcp_moderate_cwnd(tp); 1712 tcp_moderate_cwnd(tp);
1628 tp->snd_cwnd_stamp = tcp_time_stamp; 1713 tp->snd_cwnd_stamp = tcp_time_stamp;
1714
1715 /* There is something screwy going on with the retrans hints after
1716 an undo */
1717 clear_all_retrans_hints(tp);
1629} 1718}
1630 1719
1631static inline int tcp_may_undo(struct tcp_sock *tp) 1720static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1709 sk_stream_for_retrans_queue(skb, sk) { 1798 sk_stream_for_retrans_queue(skb, sk) {
1710 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 1799 TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
1711 } 1800 }
1801
1802 clear_all_retrans_hints(tp);
1803
1712 DBGUNDO(sk, tp, "partial loss"); 1804 DBGUNDO(sk, tp, "partial loss");
1713 tp->lost_out = 0; 1805 tp->lost_out = 0;
1714 tp->left_out = tp->sacked_out; 1806 tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1908 TCP_ECN_queue_cwr(tp); 2000 TCP_ECN_queue_cwr(tp);
1909 } 2001 }
1910 2002
2003 tp->bytes_acked = 0;
1911 tp->snd_cwnd_cnt = 0; 2004 tp->snd_cwnd_cnt = 0;
1912 tcp_set_ca_state(sk, TCP_CA_Recovery); 2005 tcp_set_ca_state(sk, TCP_CA_Recovery);
1913 } 2006 }
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1919} 2012}
1920 2013
1921/* Read draft-ietf-tcplw-high-performance before mucking 2014/* Read draft-ietf-tcplw-high-performance before mucking
1922 * with this code. (Superceeds RFC1323) 2015 * with this code. (Supersedes RFC1323)
1923 */ 2016 */
1924static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) 2017static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
1925{ 2018{
1926 /* RTTM Rule: A TSecr value received in a segment is used to 2019 /* RTTM Rule: A TSecr value received in a segment is used to
1927 * update the averaged RTT measurement only if the segment 2020 * update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
1932 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru> 2025 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
1933 * 2026 *
1934 * Changed: reset backoff as soon as we see the first valid sample. 2027 * Changed: reset backoff as soon as we see the first valid sample.
1935 * If we do not, we get strongly overstimated rto. With timestamps 2028 * If we do not, we get strongly overestimated rto. With timestamps
1936 * samples are accepted even from very old segments: f.e., when rtt=1 2029 * samples are accepted even from very old segments: f.e., when rtt=1
1937 * increases to 8, we retransmit 5 times and after 8 seconds delayed 2030 * increases to 8, we retransmit 5 times and after 8 seconds delayed
1938 * answer arrives rto becomes 120 seconds! If at least one of segments 2031 * answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
1940 */ 2033 */
1941 struct tcp_sock *tp = tcp_sk(sk); 2034 struct tcp_sock *tp = tcp_sk(sk);
1942 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 2035 const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1943 tcp_rtt_estimator(sk, seq_rtt, usrtt); 2036 tcp_rtt_estimator(sk, seq_rtt);
1944 tcp_set_rto(sk); 2037 tcp_set_rto(sk);
1945 inet_csk(sk)->icsk_backoff = 0; 2038 inet_csk(sk)->icsk_backoff = 0;
1946 tcp_bound_rto(sk); 2039 tcp_bound_rto(sk);
1947} 2040}
1948 2041
1949static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) 2042static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
1950{ 2043{
1951 /* We don't have a timestamp. Can only use 2044 /* We don't have a timestamp. Can only use
1952 * packets that are not retransmitted to determine 2045 * packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag
1960 if (flag & FLAG_RETRANS_DATA_ACKED) 2053 if (flag & FLAG_RETRANS_DATA_ACKED)
1961 return; 2054 return;
1962 2055
1963 tcp_rtt_estimator(sk, seq_rtt, usrtt); 2056 tcp_rtt_estimator(sk, seq_rtt);
1964 tcp_set_rto(sk); 2057 tcp_set_rto(sk);
1965 inet_csk(sk)->icsk_backoff = 0; 2058 inet_csk(sk)->icsk_backoff = 0;
1966 tcp_bound_rto(sk); 2059 tcp_bound_rto(sk);
1967} 2060}
1968 2061
1969static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, 2062static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
1970 const s32 seq_rtt, u32 *usrtt) 2063 const s32 seq_rtt)
1971{ 2064{
1972 const struct tcp_sock *tp = tcp_sk(sk); 2065 const struct tcp_sock *tp = tcp_sk(sk);
1973 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 2066 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
1974 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 2067 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
1975 tcp_ack_saw_tstamp(sk, usrtt, flag); 2068 tcp_ack_saw_tstamp(sk, flag);
1976 else if (seq_rtt >= 0) 2069 else if (seq_rtt >= 0)
1977 tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); 2070 tcp_ack_no_tstamp(sk, seq_rtt, flag);
1978} 2071}
1979 2072
1980static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, 2073static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2054 return acked; 2147 return acked;
2055} 2148}
2056 2149
2150static inline u32 tcp_usrtt(const struct sk_buff *skb)
2151{
2152 struct timeval tv, now;
2153
2154 do_gettimeofday(&now);
2155 skb_get_timestamp(skb, &tv);
2156 return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
2157}
2057 2158
2058/* Remove acknowledged frames from the retransmission queue. */ 2159/* Remove acknowledged frames from the retransmission queue. */
2059static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) 2160static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2060{ 2161{
2061 struct tcp_sock *tp = tcp_sk(sk); 2162 struct tcp_sock *tp = tcp_sk(sk);
2163 const struct inet_connection_sock *icsk = inet_csk(sk);
2062 struct sk_buff *skb; 2164 struct sk_buff *skb;
2063 __u32 now = tcp_time_stamp; 2165 __u32 now = tcp_time_stamp;
2064 int acked = 0; 2166 int acked = 0;
2065 __s32 seq_rtt = -1; 2167 __s32 seq_rtt = -1;
2066 struct timeval usnow;
2067 u32 pkts_acked = 0; 2168 u32 pkts_acked = 0;
2068 2169 void (*rtt_sample)(struct sock *sk, u32 usrtt)
2069 if (seq_usrtt) 2170 = icsk->icsk_ca_ops->rtt_sample;
2070 do_gettimeofday(&usnow);
2071 2171
2072 while ((skb = skb_peek(&sk->sk_write_queue)) && 2172 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2073 skb != sk->sk_send_head) { 2173 skb != sk->sk_send_head) {
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2107 tp->retrans_out -= tcp_skb_pcount(skb); 2207 tp->retrans_out -= tcp_skb_pcount(skb);
2108 acked |= FLAG_RETRANS_DATA_ACKED; 2208 acked |= FLAG_RETRANS_DATA_ACKED;
2109 seq_rtt = -1; 2209 seq_rtt = -1;
2110 } else if (seq_rtt < 0) 2210 } else if (seq_rtt < 0) {
2111 seq_rtt = now - scb->when; 2211 seq_rtt = now - scb->when;
2112 if (seq_usrtt) { 2212 if (rtt_sample)
2113 struct timeval tv; 2213 (*rtt_sample)(sk, tcp_usrtt(skb));
2114
2115 skb_get_timestamp(skb, &tv);
2116 *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000
2117 + (usnow.tv_usec - tv.tv_usec);
2118 } 2214 }
2119
2120 if (sacked & TCPCB_SACKED_ACKED) 2215 if (sacked & TCPCB_SACKED_ACKED)
2121 tp->sacked_out -= tcp_skb_pcount(skb); 2216 tp->sacked_out -= tcp_skb_pcount(skb);
2122 if (sacked & TCPCB_LOST) 2217 if (sacked & TCPCB_LOST)
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2126 !before(scb->end_seq, tp->snd_up)) 2221 !before(scb->end_seq, tp->snd_up))
2127 tp->urg_mode = 0; 2222 tp->urg_mode = 0;
2128 } 2223 }
2129 } else if (seq_rtt < 0) 2224 } else if (seq_rtt < 0) {
2130 seq_rtt = now - scb->when; 2225 seq_rtt = now - scb->when;
2226 if (rtt_sample)
2227 (*rtt_sample)(sk, tcp_usrtt(skb));
2228 }
2131 tcp_dec_pcount_approx(&tp->fackets_out, skb); 2229 tcp_dec_pcount_approx(&tp->fackets_out, skb);
2132 tcp_packets_out_dec(tp, skb); 2230 tcp_packets_out_dec(tp, skb);
2133 __skb_unlink(skb, &sk->sk_write_queue); 2231 __skb_unlink(skb, &sk->sk_write_queue);
2134 sk_stream_free_skb(sk, skb); 2232 sk_stream_free_skb(sk, skb);
2233 clear_all_retrans_hints(tp);
2135 } 2234 }
2136 2235
2137 if (acked&FLAG_ACKED) { 2236 if (acked&FLAG_ACKED) {
2138 const struct inet_connection_sock *icsk = inet_csk(sk); 2237 tcp_ack_update_rtt(sk, acked, seq_rtt);
2139 tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt);
2140 tcp_ack_packets_out(sk, tp); 2238 tcp_ack_packets_out(sk, tp);
2141 2239
2142 if (icsk->icsk_ca_ops->pkts_acked) 2240 if (icsk->icsk_ca_ops->pkts_acked)
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2284 } 2382 }
2285 2383
2286 /* F-RTO affects on two new ACKs following RTO. 2384 /* F-RTO affects on two new ACKs following RTO.
2287 * At latest on third ACK the TCP behavor is back to normal. 2385 * At latest on third ACK the TCP behavior is back to normal.
2288 */ 2386 */
2289 tp->frto_counter = (tp->frto_counter + 1) % 3; 2387 tp->frto_counter = (tp->frto_counter + 1) % 3;
2290} 2388}
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2299 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2397 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2300 u32 prior_in_flight; 2398 u32 prior_in_flight;
2301 s32 seq_rtt; 2399 s32 seq_rtt;
2302 s32 seq_usrtt = 0;
2303 int prior_packets; 2400 int prior_packets;
2304 2401
2305 /* If the ack is newer than sent or older than previous acks 2402 /* If the ack is newer than sent or older than previous acks
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2311 if (before(ack, prior_snd_una)) 2408 if (before(ack, prior_snd_una))
2312 goto old_ack; 2409 goto old_ack;
2313 2410
2411 if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
2412 tp->bytes_acked += ack - prior_snd_una;
2413
2314 if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { 2414 if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
2315 /* Window is constant, pure forward advance. 2415 /* Window is constant, pure forward advance.
2316 * No more checks are required. 2416 * No more checks are required.
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2352 prior_in_flight = tcp_packets_in_flight(tp); 2452 prior_in_flight = tcp_packets_in_flight(tp);
2353 2453
2354 /* See if we can take anything off of the retransmit queue. */ 2454 /* See if we can take anything off of the retransmit queue. */
2355 flag |= tcp_clean_rtx_queue(sk, &seq_rtt, 2455 flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
2356 icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL);
2357 2456
2358 if (tp->frto_counter) 2457 if (tp->frto_counter)
2359 tcp_process_frto(sk, prior_snd_una); 2458 tcp_process_frto(sk, prior_snd_una);
2360 2459
2361 if (tcp_ack_is_dubious(sk, flag)) { 2460 if (tcp_ack_is_dubious(sk, flag)) {
2362 /* Advanve CWND, if state allows this. */ 2461 /* Advance CWND, if state allows this. */
2363 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) 2462 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag))
2364 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); 2463 tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0);
2365 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2464 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list,
3148{ 3247{
3149 struct sk_buff *skb; 3248 struct sk_buff *skb;
3150 3249
3151 /* First, check that queue is collapsable and find 3250 /* First, check that queue is collapsible and find
3152 * the point where collapsing can be useful. */ 3251 * the point where collapsing can be useful. */
3153 for (skb = head; skb != tail; ) { 3252 for (skb = head; skb != tail; ) {
3154 /* No new bits? It is possible on ofo queue. */ 3253 /* No new bits? It is possible on ofo queue. */
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk)
3456 3555
3457/* 3556/*
3458 * This routine is only called when we have urgent data 3557 * This routine is only called when we have urgent data
3459 * signalled. Its the 'slow' part of tcp_urg. It could be 3558 * signaled. Its the 'slow' part of tcp_urg. It could be
3460 * moved inline now as tcp_urg is only called from one 3559 * moved inline now as tcp_urg is only called from one
3461 * place. We handle URGent data wrong. We have to - as 3560 * place. We handle URGent data wrong. We have to - as
3462 * BSD still doesn't use the correction from RFC961. 3561 * BSD still doesn't use the correction from RFC961.
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
3501 * urgent. To do this requires some care. We cannot just ignore 3600 * urgent. To do this requires some care. We cannot just ignore
3502 * tp->copied_seq since we would read the last urgent byte again 3601 * tp->copied_seq since we would read the last urgent byte again
3503 * as data, nor can we alter copied_seq until this data arrives 3602 * as data, nor can we alter copied_seq until this data arrives
3504 * or we break the sematics of SIOCATMARK (and thus sockatmark()) 3603 * or we break the semantics of SIOCATMARK (and thus sockatmark())
3505 * 3604 *
3506 * NOTE. Double Dutch. Rendering to plain English: author of comment 3605 * NOTE. Double Dutch. Rendering to plain English: author of comment
3507 * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); 3606 * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3646 tp->rx_opt.saw_tstamp = 0; 3745 tp->rx_opt.saw_tstamp = 0;
3647 3746
3648 /* pred_flags is 0xS?10 << 16 + snd_wnd 3747 /* pred_flags is 0xS?10 << 16 + snd_wnd
3649 * if header_predition is to be made 3748 * if header_prediction is to be made
3650 * 'S' will always be tp->tcp_header_len >> 2 3749 * 'S' will always be tp->tcp_header_len >> 2
3651 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to 3750 * '?' will be 0 for the fast path, otherwise pred_flags is 0 to
3652 * turn it off (when there are holes in the receive 3751 * turn it off (when there are holes in the receive
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4242 */ 4341 */
4243 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4342 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4244 !tp->srtt) 4343 !tp->srtt)
4245 tcp_ack_saw_tstamp(sk, NULL, 0); 4344 tcp_ack_saw_tstamp(sk, 0);
4246 4345
4247 if (tp->rx_opt.tstamp_ok) 4346 if (tp->rx_opt.tstamp_ok)
4248 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4347 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4372,6 +4471,7 @@ discard:
4372 4471
4373EXPORT_SYMBOL(sysctl_tcp_ecn); 4472EXPORT_SYMBOL(sysctl_tcp_ecn);
4374EXPORT_SYMBOL(sysctl_tcp_reordering); 4473EXPORT_SYMBOL(sysctl_tcp_reordering);
4474EXPORT_SYMBOL(sysctl_tcp_abc);
4375EXPORT_SYMBOL(tcp_parse_options); 4475EXPORT_SYMBOL(tcp_parse_options);
4376EXPORT_SYMBOL(tcp_rcv_established); 4476EXPORT_SYMBOL(tcp_rcv_established);
4377EXPORT_SYMBOL(tcp_rcv_state_process); 4477EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 634dabb558fd..4d5021e1929b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -39,7 +39,7 @@
39 * request_sock handling and moved 39 * request_sock handling and moved
40 * most of it into the af independent code. 40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes. 41 * Added tail drop and some other bugfixes.
42 * Added new listen sematics. 42 * Added new listen semantics.
43 * Mike McLagan : Routing by source 43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits 44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes. 45 * Andi Kleen: various fixes.
@@ -1110,24 +1110,18 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1110static int tcp_v4_checksum_init(struct sk_buff *skb) 1110static int tcp_v4_checksum_init(struct sk_buff *skb)
1111{ 1111{
1112 if (skb->ip_summed == CHECKSUM_HW) { 1112 if (skb->ip_summed == CHECKSUM_HW) {
1113 skb->ip_summed = CHECKSUM_UNNECESSARY;
1114 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, 1113 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1115 skb->nh.iph->daddr, skb->csum)) 1114 skb->nh.iph->daddr, skb->csum)) {
1115 skb->ip_summed = CHECKSUM_UNNECESSARY;
1116 return 0; 1116 return 0;
1117 1117 }
1118 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n");
1119 skb->ip_summed = CHECKSUM_NONE;
1120 } 1118 }
1119
1120 skb->csum = csum_tcpudp_nofold(skb->nh.iph->saddr, skb->nh.iph->daddr,
1121 skb->len, IPPROTO_TCP, 0);
1122
1121 if (skb->len <= 76) { 1123 if (skb->len <= 76) {
1122 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, 1124 return __skb_checksum_complete(skb);
1123 skb->nh.iph->daddr,
1124 skb_checksum(skb, 0, skb->len, 0)))
1125 return -1;
1126 skb->ip_summed = CHECKSUM_UNNECESSARY;
1127 } else {
1128 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1129 skb->nh.iph->saddr,
1130 skb->nh.iph->daddr, 0);
1131 } 1125 }
1132 return 0; 1126 return 0;
1133} 1127}
@@ -1216,10 +1210,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
1216 1210
1217 /* An explanation is required here, I think. 1211 /* An explanation is required here, I think.
1218 * Packet length and doff are validated by header prediction, 1212 * Packet length and doff are validated by header prediction,
1219 * provided case of th->doff==0 is elimineted. 1213 * provided case of th->doff==0 is eliminated.
1220 * So, we defer the checks. */ 1214 * So, we defer the checks. */
1221 if ((skb->ip_summed != CHECKSUM_UNNECESSARY && 1215 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1222 tcp_v4_checksum_init(skb) < 0)) 1216 tcp_v4_checksum_init(skb)))
1223 goto bad_packet; 1217 goto bad_packet;
1224 1218
1225 th = skb->h.th; 1219 th = skb->h.th;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b1a63b2c6b4a..1b66a2ac4321 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -158,7 +158,7 @@ kill_with_rst:
158 /* I am shamed, but failed to make it more elegant. 158 /* I am shamed, but failed to make it more elegant.
159 * Yes, it is direct reference to IP, which is impossible 159 * Yes, it is direct reference to IP, which is impossible
160 * to generalize to IPv6. Taking into account that IPv6 160 * to generalize to IPv6. Taking into account that IPv6
161 * do not undertsnad recycling in any case, it not 161 * do not understand recycling in any case, it not
162 * a big problem in practice. --ANK */ 162 * a big problem in practice. --ANK */
163 if (tw->tw_family == AF_INET && 163 if (tw->tw_family == AF_INET &&
164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && 164 tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp &&
@@ -194,7 +194,7 @@ kill_with_rst:
194 /* In window segment, it may be only reset or bare ack. */ 194 /* In window segment, it may be only reset or bare ack. */
195 195
196 if (th->rst) { 196 if (th->rst) {
197 /* This is TIME_WAIT assasination, in two flavors. 197 /* This is TIME_WAIT assassination, in two flavors.
198 * Oh well... nobody has a sufficient solution to this 198 * Oh well... nobody has a sufficient solution to this
199 * protocol bug yet. 199 * protocol bug yet.
200 */ 200 */
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
380 */ 380 */
381 newtp->snd_cwnd = 2; 381 newtp->snd_cwnd = 2;
382 newtp->snd_cwnd_cnt = 0; 382 newtp->snd_cwnd_cnt = 0;
383 newtp->bytes_acked = 0;
383 384
384 newtp->frto_counter = 0; 385 newtp->frto_counter = 0;
385 newtp->frto_highmark = 0; 386 newtp->frto_highmark = 0;
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
550 551
551 /* RFC793 page 36: "If the connection is in any non-synchronized state ... 552 /* RFC793 page 36: "If the connection is in any non-synchronized state ...
552 * and the incoming segment acknowledges something not yet 553 * and the incoming segment acknowledges something not yet
553 * sent (the segment carries an unaccaptable ACK) ... 554 * sent (the segment carries an unacceptable ACK) ...
554 * a reset is sent." 555 * a reset is sent."
555 * 556 *
556 * Invalid ACK: reset will be sent by listening socket 557 * Invalid ACK: reset will be sent by listening socket
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b907456a79f4..029c70dfb585 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
436 u16 flags; 436 u16 flags;
437 437
438 BUG_ON(len > skb->len); 438 BUG_ON(len > skb->len);
439
440 clear_all_retrans_hints(tp);
439 nsize = skb_headlen(skb) - len; 441 nsize = skb_headlen(skb) - len;
440 if (nsize < 0) 442 if (nsize < 0)
441 nsize = 0; 443 nsize = 0;
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
599 for TCP options, but includes only bare TCP header. 601 for TCP options, but includes only bare TCP header.
600 602
601 tp->rx_opt.mss_clamp is mss negotiated at connection setup. 603 tp->rx_opt.mss_clamp is mss negotiated at connection setup.
602 It is minumum of user_mss and mss received with SYN. 604 It is minimum of user_mss and mss received with SYN.
603 It also does not include TCP options. 605 It also does not include TCP options.
604 606
605 tp->pmtu_cookie is last pmtu, seen by this function. 607 tp->pmtu_cookie is last pmtu, seen by this function.
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk)
1171{ 1173{
1172 struct inet_connection_sock *icsk = inet_csk(sk); 1174 struct inet_connection_sock *icsk = inet_csk(sk);
1173 struct tcp_sock *tp = tcp_sk(sk); 1175 struct tcp_sock *tp = tcp_sk(sk);
1174 /* MSS for the peer's data. Previous verions used mss_clamp 1176 /* MSS for the peer's data. Previous versions used mss_clamp
1175 * here. I don't know if the value based on our guesses 1177 * here. I don't know if the value based on our guesses
1176 * of peer's MSS is better for the performance. It's more correct 1178 * of peer's MSS is better for the performance. It's more correct
1177 * but may be worse for the performance because of rcv_mss 1179 * but may be worse for the performance because of rcv_mss
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
1260 BUG_ON(tcp_skb_pcount(skb) != 1 || 1262 BUG_ON(tcp_skb_pcount(skb) != 1 ||
1261 tcp_skb_pcount(next_skb) != 1); 1263 tcp_skb_pcount(next_skb) != 1);
1262 1264
1263 /* Ok. We will be able to collapse the packet. */ 1265 /* changing transmit queue under us so clear hints */
1266 clear_all_retrans_hints(tp);
1267
1268 /* Ok. We will be able to collapse the packet. */
1264 __skb_unlink(next_skb, &sk->sk_write_queue); 1269 __skb_unlink(next_skb, &sk->sk_write_queue);
1265 1270
1266 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); 1271 memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk)
1330 } 1335 }
1331 } 1336 }
1332 1337
1338 clear_all_retrans_hints(tp);
1339
1333 if (!lost) 1340 if (!lost)
1334 return; 1341 return;
1335 1342
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1361 int err; 1368 int err;
1362 1369
1363 /* Do not sent more than we queued. 1/4 is reserved for possible 1370 /* Do not sent more than we queued. 1/4 is reserved for possible
1364 * copying overhead: frgagmentation, tunneling, mangling etc. 1371 * copying overhead: fragmentation, tunneling, mangling etc.
1365 */ 1372 */
1366 if (atomic_read(&sk->sk_wmem_alloc) > 1373 if (atomic_read(&sk->sk_wmem_alloc) >
1367 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) 1374 min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1468 const struct inet_connection_sock *icsk = inet_csk(sk); 1475 const struct inet_connection_sock *icsk = inet_csk(sk);
1469 struct tcp_sock *tp = tcp_sk(sk); 1476 struct tcp_sock *tp = tcp_sk(sk);
1470 struct sk_buff *skb; 1477 struct sk_buff *skb;
1471 int packet_cnt = tp->lost_out; 1478 int packet_cnt;
1479
1480 if (tp->retransmit_skb_hint) {
1481 skb = tp->retransmit_skb_hint;
1482 packet_cnt = tp->retransmit_cnt_hint;
1483 }else{
1484 skb = sk->sk_write_queue.next;
1485 packet_cnt = 0;
1486 }
1472 1487
1473 /* First pass: retransmit lost packets. */ 1488 /* First pass: retransmit lost packets. */
1474 if (packet_cnt) { 1489 if (tp->lost_out) {
1475 sk_stream_for_retrans_queue(skb, sk) { 1490 sk_stream_for_retrans_queue_from(skb, sk) {
1476 __u8 sacked = TCP_SKB_CB(skb)->sacked; 1491 __u8 sacked = TCP_SKB_CB(skb)->sacked;
1477 1492
1493 /* we could do better than to assign each time */
1494 tp->retransmit_skb_hint = skb;
1495 tp->retransmit_cnt_hint = packet_cnt;
1496
1478 /* Assume this retransmit will generate 1497 /* Assume this retransmit will generate
1479 * only one packet for congestion window 1498 * only one packet for congestion window
1480 * calculation purposes. This works because 1499 * calculation purposes. This works because
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1485 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) 1504 if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
1486 return; 1505 return;
1487 1506
1488 if (sacked&TCPCB_LOST) { 1507 if (sacked & TCPCB_LOST) {
1489 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { 1508 if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
1490 if (tcp_retransmit_skb(sk, skb)) 1509 if (tcp_retransmit_skb(sk, skb)) {
1510 tp->retransmit_skb_hint = NULL;
1491 return; 1511 return;
1512 }
1492 if (icsk->icsk_ca_state != TCP_CA_Loss) 1513 if (icsk->icsk_ca_state != TCP_CA_Loss)
1493 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); 1514 NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
1494 else 1515 else
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1501 TCP_RTO_MAX); 1522 TCP_RTO_MAX);
1502 } 1523 }
1503 1524
1504 packet_cnt -= tcp_skb_pcount(skb); 1525 packet_cnt += tcp_skb_pcount(skb);
1505 if (packet_cnt <= 0) 1526 if (packet_cnt >= tp->lost_out)
1506 break; 1527 break;
1507 } 1528 }
1508 } 1529 }
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1528 if (tcp_may_send_now(sk, tp)) 1549 if (tcp_may_send_now(sk, tp))
1529 return; 1550 return;
1530 1551
1531 packet_cnt = 0; 1552 if (tp->forward_skb_hint) {
1553 skb = tp->forward_skb_hint;
1554 packet_cnt = tp->forward_cnt_hint;
1555 } else{
1556 skb = sk->sk_write_queue.next;
1557 packet_cnt = 0;
1558 }
1559
1560 sk_stream_for_retrans_queue_from(skb, sk) {
1561 tp->forward_cnt_hint = packet_cnt;
1562 tp->forward_skb_hint = skb;
1532 1563
1533 sk_stream_for_retrans_queue(skb, sk) {
1534 /* Similar to the retransmit loop above we 1564 /* Similar to the retransmit loop above we
1535 * can pretend that the retransmitted SKB 1565 * can pretend that the retransmitted SKB
1536 * we send out here will be composed of one 1566 * we send out here will be composed of one
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
1547 continue; 1577 continue;
1548 1578
1549 /* Ok, retransmit it. */ 1579 /* Ok, retransmit it. */
1550 if (tcp_retransmit_skb(sk, skb)) 1580 if (tcp_retransmit_skb(sk, skb)) {
1581 tp->forward_skb_hint = NULL;
1551 break; 1582 break;
1583 }
1552 1584
1553 if (skb == skb_peek(&sk->sk_write_queue)) 1585 if (skb == skb_peek(&sk->sk_write_queue))
1554 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 1586 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect);
2058EXPORT_SYMBOL(tcp_make_synack); 2090EXPORT_SYMBOL(tcp_make_synack);
2059EXPORT_SYMBOL(tcp_simple_retransmit); 2091EXPORT_SYMBOL(tcp_simple_retransmit);
2060EXPORT_SYMBOL(tcp_sync_mss); 2092EXPORT_SYMBOL(tcp_sync_mss);
2093EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 327770bf5522..26d7486ee501 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
20 u32 in_flight, int flag) 20 u32 in_flight, int flag)
21{ 21{
22 struct tcp_sock *tp = tcp_sk(sk); 22 struct tcp_sock *tp = tcp_sk(sk);
23 if (in_flight < tp->snd_cwnd) 23
24 if (!tcp_is_cwnd_limited(sk, in_flight))
24 return; 25 return;
25 26
26 if (tp->snd_cwnd <= tp->snd_ssthresh) { 27 if (tp->snd_cwnd <= tp->snd_ssthresh)
27 tp->snd_cwnd++; 28 tcp_slow_start(tp);
28 } else { 29 else {
29 tp->snd_cwnd_cnt++; 30 tp->snd_cwnd_cnt++;
30 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ 31 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
31 tp->snd_cwnd++; 32 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
33 tp->snd_cwnd++;
32 tp->snd_cwnd_cnt = 0; 34 tp->snd_cwnd_cnt = 0;
33 } 35 }
34 } 36 }
35 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
36 tp->snd_cwnd_stamp = tcp_time_stamp;
37} 37}
38 38
39static u32 tcp_scalable_ssthresh(struct sock *sk) 39static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 415ee47ac1c5..e1880959614a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk)
58 * to prevent DoS attacks. It is called when a retransmission timeout 58 * to prevent DoS attacks. It is called when a retransmission timeout
59 * or zero probe timeout occurs on orphaned socket. 59 * or zero probe timeout occurs on orphaned socket.
60 * 60 *
61 * Criterium is still not confirmed experimentally and may change. 61 * Criteria is still not confirmed experimentally and may change.
62 * We kill the socket, if: 62 * We kill the socket, if:
63 * 1. If number of orphaned sockets exceeds an administratively configured 63 * 1. If number of orphaned sockets exceeds an administratively configured
64 * limit. 64 * limit.
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk)
132 hole detection. :-( 132 hole detection. :-(
133 133
134 It is place to make it. It is not made. I do not want 134 It is place to make it. It is not made. I do not want
135 to make it. It is disguisting. It does not work in any 135 to make it. It is disgusting. It does not work in any
136 case. Let me to cite the same draft, which requires for 136 case. Let me to cite the same draft, which requires for
137 us to implement this: 137 us to implement this:
138 138
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 93c5f92070f9..4376814d29fb 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
236 /* We don't have enough RTT samples to do the Vegas 236 /* We don't have enough RTT samples to do the Vegas
237 * calculation, so we'll behave like Reno. 237 * calculation, so we'll behave like Reno.
238 */ 238 */
239 if (tp->snd_cwnd > tp->snd_ssthresh) 239 tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt);
240 tp->snd_cwnd++;
241 } else { 240 } else {
242 u32 rtt, target_cwnd, diff; 241 u32 rtt, target_cwnd, diff;
243 242
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
275 */ 274 */
276 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; 275 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
277 276
278 if (tp->snd_cwnd < tp->snd_ssthresh) { 277 if (tp->snd_cwnd <= tp->snd_ssthresh) {
279 /* Slow start. */ 278 /* Slow start. */
280 if (diff > gamma) { 279 if (diff > gamma) {
281 /* Going too fast. Time to slow down 280 /* Going too fast. Time to slow down
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
295 V_PARAM_SHIFT)+1); 294 V_PARAM_SHIFT)+1);
296 295
297 } 296 }
297 tcp_slow_start(tp);
298 } else { 298 } else {
299 /* Congestion avoidance. */ 299 /* Congestion avoidance. */
300 u32 next_snd_cwnd; 300 u32 next_snd_cwnd;
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack,
327 else if (next_snd_cwnd < tp->snd_cwnd) 327 else if (next_snd_cwnd < tp->snd_cwnd)
328 tp->snd_cwnd--; 328 tp->snd_cwnd--;
329 } 329 }
330 }
331 330
332 /* Wipe the slate clean for the next RTT. */ 331 if (tp->snd_cwnd < 2)
333 vegas->cntRTT = 0; 332 tp->snd_cwnd = 2;
334 vegas->minRTT = 0x7fffffff; 333 else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
334 tp->snd_cwnd = tp->snd_cwnd_clamp;
335 }
335 } 336 }
336 337
337 /* The following code is executed for every ack we receive, 338 /* Wipe the slate clean for the next RTT. */
338 * except for conditions checked in should_advance_cwnd() 339 vegas->cntRTT = 0;
339 * before the call to tcp_cong_avoid(). Mainly this means that 340 vegas->minRTT = 0x7fffffff;
340 * we only execute this code if the ack actually acked some
341 * data.
342 */
343
344 /* If we are in slow start, increase our cwnd in response to this ACK.
345 * (If we are not in slow start then we are in congestion avoidance,
346 * and adjust our congestion window only once per RTT. See the code
347 * above.)
348 */
349 if (tp->snd_cwnd <= tp->snd_ssthresh)
350 tp->snd_cwnd++;
351
352 /* to keep cwnd from growing without bound */
353 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
354
355 /* Make sure that we are never so timid as to reduce our cwnd below
356 * 2 MSS.
357 *
358 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
359 */
360 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
361} 341}
362 342
363/* Extract info for Tcp socket info provided via netlink. */ 343/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e0bd1013cb0d..2422a5f7195d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -761,7 +761,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
761 761
762static __inline__ int __udp_checksum_complete(struct sk_buff *skb) 762static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
763{ 763{
764 return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); 764 return __skb_checksum_complete(skb);
765} 765}
766 766
767static __inline__ int udp_checksum_complete(struct sk_buff *skb) 767static __inline__ int udp_checksum_complete(struct sk_buff *skb)
@@ -1100,11 +1100,8 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
1100 if (uh->check == 0) { 1100 if (uh->check == 0) {
1101 skb->ip_summed = CHECKSUM_UNNECESSARY; 1101 skb->ip_summed = CHECKSUM_UNNECESSARY;
1102 } else if (skb->ip_summed == CHECKSUM_HW) { 1102 } else if (skb->ip_summed == CHECKSUM_HW) {
1103 skb->ip_summed = CHECKSUM_UNNECESSARY;
1104 if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) 1103 if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
1105 return 0; 1104 skb->ip_summed = CHECKSUM_UNNECESSARY;
1106 LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n");
1107 skb->ip_summed = CHECKSUM_NONE;
1108 } 1105 }
1109 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 1106 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
1110 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); 1107 skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 23e540365a14..1bdf0fb8bf8a 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -585,17 +585,16 @@ static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
585 daddr = &skb->nh.ipv6h->daddr; 585 daddr = &skb->nh.ipv6h->daddr;
586 586
587 /* Perform checksum. */ 587 /* Perform checksum. */
588 if (skb->ip_summed == CHECKSUM_HW) { 588 switch (skb->ip_summed) {
589 skb->ip_summed = CHECKSUM_UNNECESSARY; 589 case CHECKSUM_HW:
590 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 590 if (!csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6,
591 skb->csum)) { 591 skb->csum))
592 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 hw checksum failed\n"); 592 break;
593 skb->ip_summed = CHECKSUM_NONE; 593 /* fall through */
594 } 594 case CHECKSUM_NONE:
595 } 595 skb->csum = ~csum_ipv6_magic(saddr, daddr, skb->len,
596 if (skb->ip_summed == CHECKSUM_NONE) { 596 IPPROTO_ICMPV6, 0);
597 if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, 597 if (__skb_checksum_complete(skb)) {
598 skb_checksum(skb, 0, skb->len, 0))) {
599 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", 598 LIMIT_NETDEBUG(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n",
600 NIP6(*saddr), NIP6(*daddr)); 599 NIP6(*saddr), NIP6(*daddr));
601 goto discard_it; 600 goto discard_it;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 651c79b41eeb..8e9628f1c4c5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -298,13 +298,10 @@ void rawv6_err(struct sock *sk, struct sk_buff *skb,
298static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) 298static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb)
299{ 299{
300 if ((raw6_sk(sk)->checksum || sk->sk_filter) && 300 if ((raw6_sk(sk)->checksum || sk->sk_filter) &&
301 skb->ip_summed != CHECKSUM_UNNECESSARY) { 301 skb_checksum_complete(skb)) {
302 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { 302 /* FIXME: increment a raw6 drops counter here */
303 /* FIXME: increment a raw6 drops counter here */ 303 kfree_skb(skb);
304 kfree_skb(skb); 304 return 0;
305 return 0;
306 }
307 skb->ip_summed = CHECKSUM_UNNECESSARY;
308 } 305 }
309 306
310 /* Charge it to the socket. */ 307 /* Charge it to the socket. */
@@ -337,32 +334,25 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
337 if (!rp->checksum) 334 if (!rp->checksum)
338 skb->ip_summed = CHECKSUM_UNNECESSARY; 335 skb->ip_summed = CHECKSUM_UNNECESSARY;
339 336
340 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 337 if (skb->ip_summed == CHECKSUM_HW) {
341 if (skb->ip_summed == CHECKSUM_HW) { 338 skb_postpull_rcsum(skb, skb->nh.raw,
342 skb_postpull_rcsum(skb, skb->nh.raw, 339 skb->h.raw - skb->nh.raw);
343 skb->h.raw - skb->nh.raw); 340 if (!csum_ipv6_magic(&skb->nh.ipv6h->saddr,
341 &skb->nh.ipv6h->daddr,
342 skb->len, inet->num, skb->csum))
344 skb->ip_summed = CHECKSUM_UNNECESSARY; 343 skb->ip_summed = CHECKSUM_UNNECESSARY;
345 if (csum_ipv6_magic(&skb->nh.ipv6h->saddr,
346 &skb->nh.ipv6h->daddr,
347 skb->len, inet->num, skb->csum)) {
348 LIMIT_NETDEBUG(KERN_DEBUG "raw v6 hw csum failure.\n");
349 skb->ip_summed = CHECKSUM_NONE;
350 }
351 }
352 if (skb->ip_summed == CHECKSUM_NONE)
353 skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
354 &skb->nh.ipv6h->daddr,
355 skb->len, inet->num, 0);
356 } 344 }
345 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
346 skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr,
347 &skb->nh.ipv6h->daddr,
348 skb->len, inet->num, 0);
357 349
358 if (inet->hdrincl) { 350 if (inet->hdrincl) {
359 if (skb->ip_summed != CHECKSUM_UNNECESSARY && 351 if (skb_checksum_complete(skb)) {
360 (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
361 /* FIXME: increment a raw6 drops counter here */ 352 /* FIXME: increment a raw6 drops counter here */
362 kfree_skb(skb); 353 kfree_skb(skb);
363 return 0; 354 return 0;
364 } 355 }
365 skb->ip_summed = CHECKSUM_UNNECESSARY;
366 } 356 }
367 357
368 rawv6_rcv_skb(sk, skb); 358 rawv6_rcv_skb(sk, skb);
@@ -407,7 +397,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
407 if (skb->ip_summed==CHECKSUM_UNNECESSARY) { 397 if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
408 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 398 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
409 } else if (msg->msg_flags&MSG_TRUNC) { 399 } else if (msg->msg_flags&MSG_TRUNC) {
410 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) 400 if (__skb_checksum_complete(skb))
411 goto csum_copy_err; 401 goto csum_copy_err;
412 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); 402 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
413 } else { 403 } else {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d746d3b27efb..62c0e5bd931c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1401,20 +1401,18 @@ out:
1401static int tcp_v6_checksum_init(struct sk_buff *skb) 1401static int tcp_v6_checksum_init(struct sk_buff *skb)
1402{ 1402{
1403 if (skb->ip_summed == CHECKSUM_HW) { 1403 if (skb->ip_summed == CHECKSUM_HW) {
1404 skb->ip_summed = CHECKSUM_UNNECESSARY;
1405 if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, 1404 if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1406 &skb->nh.ipv6h->daddr,skb->csum)) 1405 &skb->nh.ipv6h->daddr,skb->csum)) {
1406 skb->ip_summed = CHECKSUM_UNNECESSARY;
1407 return 0; 1407 return 0;
1408 LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v6 csum failed\n"); 1408 }
1409 } 1409 }
1410
1411 skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1412 &skb->nh.ipv6h->daddr, 0);
1413
1410 if (skb->len <= 76) { 1414 if (skb->len <= 76) {
1411 if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, 1415 return __skb_checksum_complete(skb);
1412 &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0)))
1413 return -1;
1414 skb->ip_summed = CHECKSUM_UNNECESSARY;
1415 } else {
1416 skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr,
1417 &skb->nh.ipv6h->daddr,0);
1418 } 1416 }
1419 return 0; 1417 return 0;
1420} 1418}
@@ -1575,7 +1573,7 @@ static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
1575 goto discard_it; 1573 goto discard_it;
1576 1574
1577 if ((skb->ip_summed != CHECKSUM_UNNECESSARY && 1575 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1578 tcp_v6_checksum_init(skb) < 0)) 1576 tcp_v6_checksum_init(skb)))
1579 goto bad_packet; 1577 goto bad_packet;
1580 1578
1581 th = skb->h.th; 1579 th = skb->h.th;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bf9519341fd3..e671153b47b2 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -248,7 +248,7 @@ try_again:
248 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, 248 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
249 copied); 249 copied);
250 } else if (msg->msg_flags&MSG_TRUNC) { 250 } else if (msg->msg_flags&MSG_TRUNC) {
251 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) 251 if (__skb_checksum_complete(skb))
252 goto csum_copy_err; 252 goto csum_copy_err;
253 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, 253 err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
254 copied); 254 copied);
@@ -363,13 +363,10 @@ static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
363 return -1; 363 return -1;
364 } 364 }
365 365
366 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 366 if (skb_checksum_complete(skb)) {
367 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { 367 UDP6_INC_STATS_BH(UDP_MIB_INERRORS);
368 UDP6_INC_STATS_BH(UDP_MIB_INERRORS); 368 kfree_skb(skb);
369 kfree_skb(skb); 369 return 0;
370 return 0;
371 }
372 skb->ip_summed = CHECKSUM_UNNECESSARY;
373 } 370 }
374 371
375 if (sock_queue_rcv_skb(sk,skb)<0) { 372 if (sock_queue_rcv_skb(sk,skb)<0) {
@@ -491,13 +488,10 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
491 uh = skb->h.uh; 488 uh = skb->h.uh;
492 } 489 }
493 490
494 if (skb->ip_summed==CHECKSUM_HW) { 491 if (skb->ip_summed == CHECKSUM_HW &&
492 !csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum))
495 skb->ip_summed = CHECKSUM_UNNECESSARY; 493 skb->ip_summed = CHECKSUM_UNNECESSARY;
496 if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { 494
497 LIMIT_NETDEBUG(KERN_DEBUG "udp v6 hw csum failure.\n");
498 skb->ip_summed = CHECKSUM_NONE;
499 }
500 }
501 if (skb->ip_summed != CHECKSUM_UNNECESSARY) 495 if (skb->ip_summed != CHECKSUM_UNNECESSARY)
502 skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); 496 skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0);
503 497
@@ -521,8 +515,7 @@ static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
521 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) 515 if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
522 goto discard; 516 goto discard;
523 517
524 if (skb->ip_summed != CHECKSUM_UNNECESSARY && 518 if (skb_checksum_complete(skb))
525 (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)))
526 goto discard; 519 goto discard;
527 UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); 520 UDP6_INC_STATS_BH(UDP_MIB_NOPORTS);
528 521
diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c
index 122c086ee2db..dbe6105e83a5 100644
--- a/net/rxrpc/transport.c
+++ b/net/rxrpc/transport.c
@@ -23,6 +23,7 @@
23#include <linux/in.h> 23#include <linux/in.h>
24#include <linux/in6.h> 24#include <linux/in6.h>
25#include <linux/icmp.h> 25#include <linux/icmp.h>
26#include <linux/skbuff.h>
26#include <net/sock.h> 27#include <net/sock.h>
27#include <net/ip.h> 28#include <net/ip.h>
28#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) 29#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
@@ -475,15 +476,11 @@ void rxrpc_trans_receive_packet(struct rxrpc_transport *trans)
475 476
476 /* we'll probably need to checksum it (didn't call 477 /* we'll probably need to checksum it (didn't call
477 * sock_recvmsg) */ 478 * sock_recvmsg) */
478 if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { 479 if (skb_checksum_complete(pkt)) {
479 if ((unsigned short) 480 kfree_skb(pkt);
480 csum_fold(skb_checksum(pkt, 0, pkt->len, 481 rxrpc_krxiod_queue_transport(trans);
481 pkt->csum))) { 482 _leave(" CSUM failed");
482 kfree_skb(pkt); 483 return;
483 rxrpc_krxiod_queue_transport(trans);
484 _leave(" CSUM failed");
485 return;
486 }
487 } 484 }
488 485
489 addr = pkt->nh.iph->saddr; 486 addr = pkt->nh.iph->saddr;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 8f97e90f36c8..eb330d4f66d6 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -6,6 +6,9 @@
6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> 6 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
7 */ 7 */
8 8
9#include <linux/compiler.h>
10#include <linux/netdevice.h>
11#include <linux/skbuff.h>
9#include <linux/types.h> 12#include <linux/types.h>
10#include <linux/pagemap.h> 13#include <linux/pagemap.h>
11#include <linux/udp.h> 14#include <linux/udp.h>
@@ -165,6 +168,8 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
165 return -1; 168 return -1;
166 if ((unsigned short)csum_fold(desc.csum)) 169 if ((unsigned short)csum_fold(desc.csum))
167 return -1; 170 return -1;
171 if (unlikely(skb->ip_summed == CHECKSUM_HW))
172 netdev_rx_csum_fault(skb->dev);
168 return 0; 173 return 0;
169no_checksum: 174no_checksum:
170 if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) 175 if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index f16e7cdd6150..e50e7cf43737 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -623,12 +623,9 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
623 /* we can use it in-place */ 623 /* we can use it in-place */
624 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 624 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
625 rqstp->rq_arg.head[0].iov_len = len; 625 rqstp->rq_arg.head[0].iov_len = len;
626 if (skb->ip_summed != CHECKSUM_UNNECESSARY) { 626 if (skb_checksum_complete(skb)) {
627 if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { 627 skb_free_datagram(svsk->sk_sk, skb);
628 skb_free_datagram(svsk->sk_sk, skb); 628 return 0;
629 return 0;
630 }
631 skb->ip_summed = CHECKSUM_UNNECESSARY;
632 } 629 }
633 rqstp->rq_skbuff = skb; 630 rqstp->rq_skbuff = skb;
634 } 631 }