-rw-r--r--  Documentation/networking/ip-sysctl.txt |   5
-rw-r--r--  drivers/sbus/char/display7seg.c         |   2
-rw-r--r--  drivers/scsi/lpfc/lpfc_init.c           |   1
-rw-r--r--  include/linux/sysctl.h                  |   1
-rw-r--r--  include/linux/tcp.h                     |  16
-rw-r--r--  include/net/sock.h                      |   6
-rw-r--r--  include/net/tcp.h                       |  71
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c              |   8
-rw-r--r--  net/ipv4/tcp.c                          |   3
-rw-r--r--  net/ipv4/tcp_bic.c                      |  12
-rw-r--r--  net/ipv4/tcp_cong.c                     |  40
-rw-r--r--  net/ipv4/tcp_highspeed.c                |  11
-rw-r--r--  net/ipv4/tcp_htcp.c                     |  13
-rw-r--r--  net/ipv4/tcp_hybla.c                    |   6
-rw-r--r--  net/ipv4/tcp_input.c                    | 288
-rw-r--r--  net/ipv4/tcp_ipv4.c                     |   4
-rw-r--r--  net/ipv4/tcp_minisocks.c                |   7
-rw-r--r--  net/ipv4/tcp_output.c                   |  61
-rw-r--r--  net/ipv4/tcp_scalable.c                 |  14
-rw-r--r--  net/ipv4/tcp_timer.c                    |   4
-rw-r--r--  net/ipv4/tcp_vegas.c                    |  42
21 files changed, 414 insertions, 201 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 65895bb51414..ebc09a159f62 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -78,6 +78,11 @@ inet_peer_gc_maxtime - INTEGER
 
 TCP variables:
 
+tcp_abc - INTEGER
+	Controls Appropriate Byte Count defined in RFC 3465. If set to
+	0, congestion avoidance increases cwnd once per ACK; 1 is the
+	conservative RFC value, and 2 is the more aggressive variant.
+
 tcp_syn_retries - INTEGER
 	Number of times initial SYNs for an active TCP connection attempt
 	will be retransmitted. Should not be higher than 255. Default value
diff --git a/drivers/sbus/char/display7seg.c b/drivers/sbus/char/display7seg.c
index 2c86a4b809cd..c3a51d1fae5d 100644
--- a/drivers/sbus/char/display7seg.c
+++ b/drivers/sbus/char/display7seg.c
@@ -119,7 +119,7 @@ static long d7s_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	__u8 regs = readb(d7s_regs);
 	__u8 ireg = 0;
-	int error = 0
+	int error = 0;
 
 	if (D7S_MINOR != iminor(file->f_dentry->d_inode))
 		return -ENODEV;
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index c90723860a04..07498118359d 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -1704,7 +1704,6 @@ MODULE_DEVICE_TABLE(pci, lpfc_id_table);
 
 static struct pci_driver lpfc_driver = {
 	.name		= LPFC_DRIVER_NAME,
-	.owner		= THIS_MODULE,
 	.id_table	= lpfc_id_table,
 	.probe		= lpfc_pci_probe_one,
 	.remove		= __devexit_p(lpfc_pci_remove_one),
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 22cf5e1ac987..ab2791b3189d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -390,6 +390,7 @@ enum
 	NET_TCP_BIC_BETA=108,
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
 	NET_TCP_CONG_CONTROL=110,
+	NET_TCP_ABC=111,
 };
 
 enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index ac4ca44c75ca..0e1da6602e05 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -307,6 +307,21 @@ struct tcp_sock {
 	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
 	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
 
+	struct tcp_sack_block recv_sack_cache[4];
+
+	/* from STCP, retrans queue hinting */
+	struct sk_buff *lost_skb_hint;
+
+	struct sk_buff *scoreboard_skb_hint;
+	struct sk_buff *retransmit_skb_hint;
+	struct sk_buff *forward_skb_hint;
+	struct sk_buff *fastpath_skb_hint;
+
+	int	fastpath_cnt_hint;
+	int	lost_cnt_hint;
+	int	retransmit_cnt_hint;
+	int	forward_cnt_hint;
+
 	__u16	advmss;		/* Advertised MSS			*/
 	__u16	prior_ssthresh; /* ssthresh saved at recovery start	*/
 	__u32	lost_out;	/* Lost packets				*/
@@ -326,6 +341,7 @@ struct tcp_sock {
 	__u32	snd_up;		/* Urgent pointer			*/
 
 	__u32	total_retrans;	/* Total retransmits for entire connection */
+	__u32	bytes_acked;	/* Appropriate Byte Counting - RFC3465 */
 
 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
 	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
diff --git a/include/net/sock.h b/include/net/sock.h
index ff13c4cc287a..982b4ecd187b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1247,6 +1247,12 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
 		(skb != (struct sk_buff *)&(sk)->sk_write_queue);	\
 		skb = skb->next)
 
+/* from STCP for fast SACK process */
+#define sk_stream_for_retrans_queue_from(skb, sk)			\
+	for (; (skb != (sk)->sk_send_head) &&				\
+		(skb != (struct sk_buff *)&(sk)->sk_write_queue);	\
+		skb = skb->next)
+
 /*
  * Default write policy as shown to user space via poll/select/SIGIO
  */
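The new iterator is identical to sk_stream_for_retrans_queue() except that it starts from a caller-supplied skb instead of the queue head. A minimal usage sketch, mirroring how the SACK code later in this patch seeds the walk from a saved hint (the hint fields are the ones added to struct tcp_sock above):

	struct sk_buff *skb;

	/* resume the walk from the saved hint, or from the head */
	skb = tp->fastpath_skb_hint ? tp->fastpath_skb_hint
				    : sk->sk_write_queue.next;
	sk_stream_for_retrans_queue_from(skb, sk) {
		/* process skb without rescanning everything
		 * before the hint */
	}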
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 96cc3b434e40..0f9848011972 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -89,10 +89,10 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
  */
 
 #define TCP_SYN_RETRIES	 5	/* number of times to retry active opening a
-				 * connection: ~180sec is RFC minumum	*/
+				 * connection: ~180sec is RFC minimum	*/
 
 #define TCP_SYNACK_RETRIES 5	/* number of times to retry passive opening a
-				 * connection: ~180sec is RFC minumum	*/
+				 * connection: ~180sec is RFC minimum	*/
 
 
 #define TCP_ORPHAN_RETRIES 7	/* number of times to retry on an orphaned
@@ -180,7 +180,7 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
 #define TCP_NAGLE_CORK		2	/* Socket is corked	    */
-#define TCP_NAGLE_PUSH		4	/* Cork is overriden for already queued data */
+#define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */
 
 extern struct inet_timewait_death_row tcp_death_row;
 
@@ -218,6 +218,7 @@ extern int sysctl_tcp_low_latency;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
+extern int sysctl_tcp_abc;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
@@ -551,13 +552,13 @@ extern u32 __tcp_select_window(struct sock *sk);
 
 /* TCP timestamps are only 32-bits, this causes a slight
  * complication on 64-bit systems since we store a snapshot
- * of jiffies in the buffer control blocks below. We decidely
+ * of jiffies in the buffer control blocks below. We decidedly
  * only use of the low 32-bits of jiffies and hide the ugly
  * casts with the following macro.
  */
 #define tcp_time_stamp		((__u32)(jiffies))
 
-/* This is what the send packet queueing engine uses to pass
+/* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission
  * code.  We also store the host-order sequence numbers in
  * here too.  This is 36 bytes on 32-bit architectures,
@@ -597,7 +598,7 @@ struct tcp_skb_cb {
 #define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
 #define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
 
-#define TCPCB_URG		0x20	/* Urgent pointer advenced here	*/
+#define TCPCB_URG		0x20	/* Urgent pointer advanced here	*/
 
 #define TCPCB_AT_TAIL		(TCPCB_URG)
 
@@ -765,6 +766,33 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 			    (tp->snd_cwnd >> 2)));
 }
 
+/*
+ * Linear increase during slow start
+ */
+static inline void tcp_slow_start(struct tcp_sock *tp)
+{
+	if (sysctl_tcp_abc) {
+		/* RFC3465: Slow Start
+		 * TCP sender SHOULD increase cwnd by the number of
+		 * previously unacknowledged bytes ACKed by each incoming
+		 * acknowledgment, provided the increase is not more than L
+		 */
+		if (tp->bytes_acked < tp->mss_cache)
+			return;
+
+		/* We MAY increase by 2 if discovered delayed ack */
+		if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+		}
+	}
+	tp->bytes_acked = 0;
+
+	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+		tp->snd_cwnd++;
+}
+
+
 static inline void tcp_sync_left_out(struct tcp_sock *tp)
 {
 	if (tp->rx_opt.sack_ok &&
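The control flow above is easy to misread: with ABC enabled, an ACK covering less than one MSS returns early and leaves bytes_acked accumulating, so only a sufficiently large ACK reaches the final increment. A standalone sketch of the per-ACK increment it produces (illustrative only; hypothetical helper, clamp checks omitted):

	/* cwnd increase for one ACK during slow start, as a pure function.
	 * abc == 0: one segment per ACK (pre-RFC3465 behavior);
	 * abc >= 1: only after a full MSS of new data has been ACKed;
	 * abc == 2: two segments when the ACK covers > 2*MSS, i.e. the
	 *           L = 2*SMSS limit of RFC 3465 for delayed ACKs.
	 */
	static int slow_start_increment(int abc, __u32 bytes_acked, __u32 mss)
	{
		if (abc && bytes_acked < mss)
			return 0;		/* keep accumulating */
		if (abc > 1 && bytes_acked > 2 * mss)
			return 2;
		return 1;
	}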
@@ -794,6 +822,7 @@ static inline void tcp_enter_cwr(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->prior_ssthresh = 0;
+	tp->bytes_acked = 0;
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		__tcp_enter_cwr(sk);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
@@ -810,6 +839,27 @@ static __inline__ __u32 tcp_max_burst(const struct tcp_sock *tp)
 	return 3;
 }
 
+/* RFC2861 Check whether we are limited by application or congestion window
+ * This is the inverse of cwnd check in tcp_tso_should_defer
+ */
+static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 left;
+
+	if (in_flight >= tp->snd_cwnd)
+		return 1;
+
+	if (!(sk->sk_route_caps & NETIF_F_TSO))
+		return 0;
+
+	left = tp->snd_cwnd - in_flight;
+	if (sysctl_tcp_tso_win_divisor)
+		return left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd;
+	else
+		return left <= tcp_max_burst(tp);
+}
+
 static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss,
 					   const struct sk_buff *skb)
 {
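A self-contained recheck of the arithmetic with made-up numbers (simplified model: it ignores the NETIF_F_TSO test and the tcp_max_burst() fallback):

	#include <assert.h>

	/* Reduced model of tcp_is_cwnd_limited() for the divisor path. */
	static int cwnd_limited(unsigned int cwnd, unsigned int in_flight,
				unsigned int divisor)
	{
		if (in_flight >= cwnd)
			return 1;
		return (cwnd - in_flight) * divisor < cwnd;
	}

	int main(void)
	{
		assert(cwnd_limited(30, 30, 3));	/* window fully used */
		assert(cwnd_limited(30, 21, 3));	/* left = 9, 27 < 30 */
		assert(!cwnd_limited(30, 18, 3));	/* left = 12, 36 >= 30:
							   application limited */
		return 0;
	}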
@@ -1157,6 +1207,15 @@ static inline void tcp_mib_init(void)
 	TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1);
 }
 
+/* from STCP */
+static inline void clear_all_retrans_hints(struct tcp_sock *tp){
+	tp->lost_skb_hint = NULL;
+	tp->scoreboard_skb_hint = NULL;
+	tp->retransmit_skb_hint = NULL;
+	tp->forward_skb_hint = NULL;
+	tp->fastpath_skb_hint = NULL;
+}
+
 /* /proc */
 enum tcp_seq_states {
 	TCP_SEQ_STATE_LISTENING,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 652685623519..01444a02b48b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -645,6 +645,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_tcp_congestion_control,
 		.strategy	= &sysctl_tcp_congestion_control,
 	},
+	{
+		.ctl_name	= NET_TCP_ABC,
+		.procname	= "tcp_abc",
+		.data		= &sysctl_tcp_abc,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 72b7c22e1ea5..9ac7a4f46bd8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1640,7 +1640,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	} else if (tcp_need_reset(old_state) ||
 		   (tp->snd_nxt != tp->write_seq &&
 		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
-		/* The last check adjusts for discrepance of Linux wrt. RFC
+		/* The last check adjusts for discrepancy of Linux wrt. RFC
 		 * states
 		 */
 		tcp_send_active_reset(sk, gfp_any());
@@ -1669,6 +1669,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->packets_out = 0;
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_cnt = 0;
+	tp->bytes_acked = 0;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index ae35e0609047..1d0cd86621b1 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -217,17 +217,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack,
 
 	bictcp_low_utilization(sk, data_acked);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
 		bictcp_update(ca, tp->snd_cwnd);
 
 		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
 		 */
 		if (tp->snd_cwnd_cnt >= ca->cnt) {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bbf2d6624e89..c7cc62c8dc12 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -186,24 +186,32 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-		/* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-			tp->snd_cwnd_cnt = 0;
-		} else
-			tp->snd_cwnd_cnt++;
-	}
+	/* In "safe" area, increase. */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+
+	/* In dangerous area, increase slowly. */
+	else if (sysctl_tcp_abc) {
+		/* RFC3465: Appropriate Byte Count
+		 * increase once for each full cwnd acked
+		 */
+		if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
+			tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+		}
+	} else {
+		/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
+		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
 }
 EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
 
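Under ABC the congestion-avoidance counter is bytes rather than ACKs: cwnd grows by one segment only after a full window's worth of bytes has been cumulatively acknowledged, which makes the growth rate independent of ACK frequency. A simplified standalone model of that accounting (hypothetical helper; the authoritative update is tcp_reno_cong_avoid() above):

	/* One RFC 3465 congestion-avoidance step: spend a full cwnd's
	 * worth of ACKed bytes per single-segment increase.
	 */
	static void abc_cong_avoid_step(__u32 *cwnd, __u32 cwnd_clamp,
					__u32 *bytes_acked, __u32 mss)
	{
		if (*bytes_acked >= *cwnd * mss) {
			*bytes_acked -= *cwnd * mss;
			if (*cwnd < cwnd_clamp)
				(*cwnd)++;
		}
	}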
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index 6acc04bde080..82b3c189bd7d 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -111,18 +111,17 @@ static void hstcp_init(struct sock *sk)
 }
 
 static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt,
-			     u32 in_flight, int good)
+			     u32 in_flight, u32 pkts_acked)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct hstcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
 		/* Update AIMD parameters */
 		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
 			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index e47b37984e95..3284cfb993e6 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -207,14 +207,13 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct htcp *ca = inet_csk_ca(sk);
 
-	if (in_flight < tp->snd_cwnd)
+	if (!tcp_is_cwnd_limited(sk, in_flight))
 		return;
 
-	if (tp->snd_cwnd <= tp->snd_ssthresh) {
-		/* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tcp_slow_start(tp);
+	else {
+
 		measure_rtt(sk);
 
 		/* keep track of number of round-trip times since last backoff event */
@@ -224,7 +223,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 			htcp_alpha_update(ca);
 		}
 
-		/* In dangerous area, increase slowly. 
+		/* In dangerous area, increase slowly.
 		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
 		 */
 		if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 77add63623df..40dbb3877510 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -100,12 +100,12 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
 		ca->minrtt = tp->srtt;
 	}
 
+	if (!tcp_is_cwnd_limited(sk, in_flight))
+		return;
+
 	if (!ca->hybla_en)
 		return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag);
 
-	if (in_flight < tp->snd_cwnd)
-		return;
-
 	if (ca->rho == 0)
 		hybla_recalc_param(sk);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e98b57578dc..40a26b7157b4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -42,7 +42,7 @@
  *		Andi Kleen	:	Moved open_request checking here
  *					and process RSTs for open_requests.
  *		Andi Kleen	:	Better prune_queue, and other fixes.
- *		Andrey Savochkin:	Fix RTT measurements in the presnce of
+ *		Andrey Savochkin:	Fix RTT measurements in the presence of
  *					timestamps.
  *		Andrey Savochkin:	Check sequence numbers correctly when
  *					removing SACKs due to in sequence incoming
@@ -89,6 +89,7 @@ int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
+int sysctl_tcp_abc = 1;
 
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
@@ -223,7 +224,7 @@ static void tcp_fixup_sndbuf(struct sock *sk)
 * of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
- * window and then starts to feed us spagetti. But it should work
+ * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */
 
@@ -233,7 +234,7 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp,
 {
 	/* Optimize this! */
 	int truesize = tcp_win_from_space(skb->truesize)/2;
-	int window = tcp_full_space(sk)/2;
+	int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2;
 
 	while (tp->rcv_ssthresh <= window) {
 		if (truesize <= skb->len)
@@ -277,7 +278,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
 
 	/* Try to select rcvbuf so that 4 mss-sized segments
-	 * will fit to window and correspoding skbs will fit to our rcvbuf.
+	 * will fit to window and corresponding skbs will fit to our rcvbuf.
 	 * (was 3; 4 is minimum to allow fast retransmit to work.)
 	 */
 	while (tcp_win_from_space(rcvmem) < tp->advmss)
@@ -286,7 +287,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
 	sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
 }
 
-/* 4. Try to fixup all. It is made iimediately after connection enters
+/* 4. Try to fixup all. It is made immediately after connection enters
  * established state.
  */
 static void tcp_init_buffer_space(struct sock *sk)
@@ -326,37 +327,18 @@ static void tcp_init_buffer_space(struct sock *sk)
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct sk_buff *skb;
-	unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
-	int ofo_win = 0;
 
 	icsk->icsk_ack.quick = 0;
 
-	skb_queue_walk(&tp->out_of_order_queue, skb) {
-		ofo_win += skb->len;
-	}
-
-	/* If overcommit is due to out of order segments,
-	 * do not clamp window. Try to expand rcvbuf instead.
-	 */
-	if (ofo_win) {
-		if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
-		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-		    !tcp_memory_pressure &&
-		    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
-			sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-					    sysctl_tcp_rmem[2]);
+	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+	    !tcp_memory_pressure &&
+	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+				    sysctl_tcp_rmem[2]);
 	}
-	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
-		app_win += ofo_win;
-		if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
-			app_win >>= 1;
-		if (app_win > icsk->icsk_ack.rcv_mss)
-			app_win -= icsk->icsk_ack.rcv_mss;
-		app_win = max(app_win, 2U*tp->advmss);
-
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
-	}
 }
 
 /* Receiver "autotuning" code.
@@ -385,8 +367,8 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
	 * are stalled on filesystem I/O.
	 *
	 * Also, since we are only going for a minimum in the
-	 * non-timestamp case, we do not smoothe things out
-	 * else with timestamps disabled convergance takes too
+	 * non-timestamp case, we do not smooth things out
+	 * else with timestamps disabled convergence takes too
	 * long.
	 */
	if (!win_dep) {
@@ -395,7 +377,7 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 		} else if (m < new_sample)
 			new_sample = m << 3;
 	} else {
-		/* No previous mesaure. */
+		/* No previous measure. */
 		new_sample = m << 3;
 	}
 
@@ -524,7 +506,7 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 		if (icsk->icsk_ack.ato > icsk->icsk_rto)
 			icsk->icsk_ack.ato = icsk->icsk_rto;
 	} else if (m > icsk->icsk_rto) {
-		/* Too long gap. Apparently sender falled to
+		/* Too long gap. Apparently sender failed to
 		 * restart window, so that we send ACKs quickly.
 		 */
 		tcp_incr_quickack(sk);
@@ -548,10 +530,9 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
-static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt)
+static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_connection_sock *icsk = inet_csk(sk);
 	long m = mrtt;	/* RTT */
 
 	/* The following amusing code comes from Jacobson's
@@ -565,7 +546,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
-	 * too slowly, when it should be incresed fastly, decrease too fastly
+	 * too slowly, when it should be increased fastly, decrease too fastly
	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
	 * does not matter how to _calculate_ it. Seems, it was trap
	 * that VJ failed to avoid. 8)
@@ -610,9 +591,6 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
 			tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
 		tp->rtt_seq = tp->snd_nxt;
 	}
-
-	if (icsk->icsk_ca_ops->rtt_sample)
-		icsk->icsk_ca_ops->rtt_sample(sk, *usrtt);
 }
 
 /* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -629,14 +607,14 @@ static inline void tcp_set_rto(struct sock *sk)
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
-	 *    ACKs in some curcumstances.
+	 *    ACKs in some circumstances.
	 */
 	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;
 
 	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
-	 *    with correct one. It is exaclty, which we pretend to do.
+	 *    with correct one. It is exactly, which we pretend to do.
	 */
 }
 
@@ -794,7 +772,7 @@ static void tcp_init_metrics(struct sock *sk)
 * to make it more realistic.
 *
 * A bit of theory. RTT is time passed after "normal" sized packet
- * is sent until it is ACKed. In normal curcumstances sending small
+ * is sent until it is ACKed. In normal circumstances sending small
 * packets force peer to delay ACKs and calculation is correct too.
 * The algorithm is adaptive and, provided we follow specs, it
 * NEVER underestimate RTT. BUT! If peer tries to make some clever
@@ -919,18 +897,32 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	int prior_fackets;
 	u32 lost_retrans = 0;
 	int flag = 0;
+	int dup_sack = 0;
 	int i;
 
 	if (!tp->sacked_out)
 		tp->fackets_out = 0;
 	prior_fackets = tp->fackets_out;
 
-	for (i=0; i<num_sacks; i++, sp++) {
-		struct sk_buff *skb;
-		__u32 start_seq = ntohl(sp->start_seq);
-		__u32 end_seq = ntohl(sp->end_seq);
-		int fack_count = 0;
-		int dup_sack = 0;
+	/* SACK fastpath:
+	 * if the only SACK change is the increase of the end_seq of
+	 * the first block then only apply that SACK block
+	 * and use retrans queue hinting otherwise slowpath */
+	flag = 1;
+	for (i = 0; i< num_sacks; i++) {
+		__u32 start_seq = ntohl(sp[i].start_seq);
+		__u32 end_seq = ntohl(sp[i].end_seq);
+
+		if (i == 0){
+			if (tp->recv_sack_cache[i].start_seq != start_seq)
+				flag = 0;
+		} else {
+			if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
+			    (tp->recv_sack_cache[i].end_seq != end_seq))
+				flag = 0;
+		}
+		tp->recv_sack_cache[i].start_seq = start_seq;
+		tp->recv_sack_cache[i].end_seq = end_seq;
 
 		/* Check for D-SACK. */
 		if (i == 0) {
@@ -962,15 +954,58 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			if (before(ack, prior_snd_una - tp->max_window))
 				return 0;
 		}
+	}
+
+	if (flag)
+		num_sacks = 1;
+	else {
+		int j;
+		tp->fastpath_skb_hint = NULL;
+
+		/* order SACK blocks to allow in order walk of the retrans queue */
+		for (i = num_sacks-1; i > 0; i--) {
+			for (j = 0; j < i; j++){
+				if (after(ntohl(sp[j].start_seq),
+					  ntohl(sp[j+1].start_seq))){
+					sp[j].start_seq = htonl(tp->recv_sack_cache[j+1].start_seq);
+					sp[j].end_seq = htonl(tp->recv_sack_cache[j+1].end_seq);
+					sp[j+1].start_seq = htonl(tp->recv_sack_cache[j].start_seq);
+					sp[j+1].end_seq = htonl(tp->recv_sack_cache[j].end_seq);
+				}
+
+			}
+		}
+	}
+
+	/* clear flag as used for different purpose in following code */
+	flag = 0;
+
+	for (i=0; i<num_sacks; i++, sp++) {
+		struct sk_buff *skb;
+		__u32 start_seq = ntohl(sp->start_seq);
+		__u32 end_seq = ntohl(sp->end_seq);
+		int fack_count;
+
+		/* Use SACK fastpath hint if valid */
+		if (tp->fastpath_skb_hint) {
+			skb = tp->fastpath_skb_hint;
+			fack_count = tp->fastpath_cnt_hint;
+		} else {
+			skb = sk->sk_write_queue.next;
+			fack_count = 0;
+		}
 
 		/* Event "B" in the comment above. */
 		if (after(end_seq, tp->high_seq))
 			flag |= FLAG_DATA_LOST;
 
-		sk_stream_for_retrans_queue(skb, sk) {
+		sk_stream_for_retrans_queue_from(skb, sk) {
 			int in_sack, pcount;
 			u8 sacked;
 
+			tp->fastpath_skb_hint = skb;
+			tp->fastpath_cnt_hint = fack_count;
+
 			/* The retransmission queue is always in order, so
			 * we can short-circuit the walk early.
			 */
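The fastpath condition implemented by the loop above is: every cached block matches the incoming blocks, except that the first block's end_seq may differ (the common case of one block growing as in-order SACKs arrive). Distilled into a predicate (illustrative sketch only; byte-order conversion and the cache update are omitted):

	/* 1 = only sp[0].end_seq may have changed since the last ACK,
	 * so the tagging walk can resume from fastpath_skb_hint.
	 */
	static int sack_fastpath_ok(const struct tcp_sack_block *cache,
				    const struct tcp_sack_block *sp,
				    int num_sacks)
	{
		int i;

		for (i = 0; i < num_sacks; i++) {
			if (cache[i].start_seq != sp[i].start_seq)
				return 0;
			if (i > 0 && cache[i].end_seq != sp[i].end_seq)
				return 0;
		}
		return 1;
	}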
@@ -1045,6 +1080,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 					TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
 					tp->lost_out -= tcp_skb_pcount(skb);
 					tp->retrans_out -= tcp_skb_pcount(skb);
+
+					/* clear lost hint */
+					tp->retransmit_skb_hint = NULL;
 				}
 			} else {
 				/* New sack for not retransmitted frame,
@@ -1057,6 +1095,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 				if (sacked & TCPCB_LOST) {
 					TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 					tp->lost_out -= tcp_skb_pcount(skb);
+
+					/* clear lost hint */
+					tp->retransmit_skb_hint = NULL;
 				}
 			}
 
@@ -1080,6 +1121,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		    (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
+			tp->retransmit_skb_hint = NULL;
 		}
 	}
 }
@@ -1107,6 +1149,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 			tp->retrans_out -= tcp_skb_pcount(skb);
 
+			/* clear lost hint */
+			tp->retransmit_skb_hint = NULL;
+
 			if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
 				tp->lost_out += tcp_skb_pcount(skb);
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
@@ -1214,6 +1259,8 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
+
+	clear_all_retrans_hints(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1251,6 +1298,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 
+	tp->bytes_acked = 0;
 	tcp_clear_retrans(tp);
 
 	/* Push undo marker, if it was plain RTO and nothing
@@ -1279,6 +1327,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tcp_set_ca_state(sk, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
+
+	clear_all_retrans_hints(tp);
 }
 
 static int tcp_check_sack_reneging(struct sock *sk)
@@ -1503,17 +1553,37 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
 			       int packets, u32 high_seq)
 {
 	struct sk_buff *skb;
-	int cnt = packets;
+	int cnt;
 
-	BUG_TRAP(cnt <= tp->packets_out);
+	BUG_TRAP(packets <= tp->packets_out);
+	if (tp->lost_skb_hint) {
+		skb = tp->lost_skb_hint;
+		cnt = tp->lost_cnt_hint;
+	} else {
+		skb = sk->sk_write_queue.next;
+		cnt = 0;
+	}
 
-	sk_stream_for_retrans_queue(skb, sk) {
-		cnt -= tcp_skb_pcount(skb);
-		if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+	sk_stream_for_retrans_queue_from(skb, sk) {
+		/* TODO: do this better */
+		/* this is not the most efficient way to do this... */
+		tp->lost_skb_hint = skb;
+		tp->lost_cnt_hint = cnt;
+		cnt += tcp_skb_pcount(skb);
+		if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
 			break;
 		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
+
+			/* clear xmit_retransmit_queue hints
+			 *  if this is beyond hint */
+			if (tp->retransmit_skb_hint != NULL &&
+			    before(TCP_SKB_CB(skb)->seq,
+				   TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
+
+				tp->retransmit_skb_hint = NULL;
+			}
 		}
 	}
 	tcp_sync_left_out(tp);
@@ -1540,13 +1610,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
 	if (tcp_head_timedout(sk, tp)) {
 		struct sk_buff *skb;
 
-		sk_stream_for_retrans_queue(skb, sk) {
-			if (tcp_skb_timedout(sk, skb) &&
-			    !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+		skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
+			: sk->sk_write_queue.next;
+
+		sk_stream_for_retrans_queue_from(skb, sk) {
+			if (!tcp_skb_timedout(sk, skb))
+				break;
+
+			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 				tp->lost_out += tcp_skb_pcount(skb);
+
+				/* clear xmit_retrans hint */
+				if (tp->retransmit_skb_hint &&
+				    before(TCP_SKB_CB(skb)->seq,
+					   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+
+					tp->retransmit_skb_hint = NULL;
 			}
 		}
+
+		tp->scoreboard_skb_hint = skb;
+
 		tcp_sync_left_out(tp);
 	}
 }
@@ -1626,6 +1711,10 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
 	}
 	tcp_moderate_cwnd(tp);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
+
+	/* There is something screwy going on with the retrans hints after
+	   an undo */
+	clear_all_retrans_hints(tp);
 }
 
 static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -1709,6 +1798,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
 		sk_stream_for_retrans_queue(skb, sk) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 		}
+
+		clear_all_retrans_hints(tp);
+
 		DBGUNDO(sk, tp, "partial loss");
 		tp->lost_out = 0;
 		tp->left_out = tp->sacked_out;
@@ -1908,6 +2000,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 			TCP_ECN_queue_cwr(tp);
 		}
 
+		tp->bytes_acked = 0;
 		tp->snd_cwnd_cnt = 0;
 		tcp_set_ca_state(sk, TCP_CA_Recovery);
 	}
@@ -1919,9 +2012,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 }
 
 /* Read draft-ietf-tcplw-high-performance before mucking
- * with this code. (Superceeds RFC1323)
+ * with this code. (Supersedes RFC1323)
 */
-static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag)
+static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
 {
 	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
@@ -1932,7 +2025,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
	 * 1998/04/10	Andrey V. Savochkin <saw@msu.ru>
	 *
	 * Changed: reset backoff as soon as we see the first valid sample.
-	 * If we do not, we get strongly overstimated rto. With timestamps
+	 * If we do not, we get strongly overestimated rto. With timestamps
	 * samples are accepted even from very old segments: f.e., when rtt=1
	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
	 * answer arrives rto becomes 120 seconds! If at least one of segments
@@ -1940,13 +2033,13 @@ static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
	 */
 	struct tcp_sock *tp = tcp_sk(sk);
 	const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	tcp_rtt_estimator(sk, seq_rtt);
 	tcp_set_rto(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 	tcp_bound_rto(sk);
 }
 
-static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag)
+static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
	 * packets that are not retransmitted to determine
@@ -1960,21 +2053,21 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(sk, seq_rtt, usrtt);
+	tcp_rtt_estimator(sk, seq_rtt);
 	tcp_set_rto(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 	tcp_bound_rto(sk);
 }
 
 static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
-				      const s32 seq_rtt, u32 *usrtt)
+				      const s32 seq_rtt)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(sk, usrtt, flag);
+		tcp_ack_saw_tstamp(sk, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag);
+		tcp_ack_no_tstamp(sk, seq_rtt, flag);
 }
 
 static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt,
@@ -2054,20 +2147,27 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
 	return acked;
 }
 
+static inline u32 tcp_usrtt(const struct sk_buff *skb)
+{
+	struct timeval tv, now;
+
+	do_gettimeofday(&now);
+	skb_get_timestamp(skb, &tv);
+	return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
+}
 
 /* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
 	__u32 now = tcp_time_stamp;
 	int acked = 0;
 	__s32 seq_rtt = -1;
-	struct timeval usnow;
 	u32 pkts_acked = 0;
+	void (*rtt_sample)(struct sock *sk, u32 usrtt)
+		= icsk->icsk_ca_ops->rtt_sample;
 
-	if (seq_usrtt)
-		do_gettimeofday(&usnow);
 
 	while ((skb = skb_peek(&sk->sk_write_queue)) &&
 	       skb != sk->sk_send_head) {
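With this change the rtt_sample hook receives one microsecond-resolution sample per newly ACKed skb, computed by tcp_usrtt() from the skb's transmit timestamp. A minimal sketch of a consumer, in the spirit of the delay-based modules touched by this patch (tcp_vegas.c); names here are hypothetical:

	/* Track the minimum RTT over a measurement window, the way
	 * Vegas-style schemes use the usrtt samples delivered from
	 * tcp_clean_rtx_queue().
	 */
	struct rtt_min {
		__u32 min_us;	/* 0 = no sample yet */
	};

	static void sample_rtt(struct rtt_min *m, __u32 usrtt)
	{
		if (!m->min_us || usrtt < m->min_us)
			m->min_us = usrtt;
	}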
@@ -2107,16 +2207,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2107 | tp->retrans_out -= tcp_skb_pcount(skb); | 2207 | tp->retrans_out -= tcp_skb_pcount(skb); |
2108 | acked |= FLAG_RETRANS_DATA_ACKED; | 2208 | acked |= FLAG_RETRANS_DATA_ACKED; |
2109 | seq_rtt = -1; | 2209 | seq_rtt = -1; |
2110 | } else if (seq_rtt < 0) | 2210 | } else if (seq_rtt < 0) { |
2111 | seq_rtt = now - scb->when; | 2211 | seq_rtt = now - scb->when; |
2112 | if (seq_usrtt) { | 2212 | if (rtt_sample) |
2113 | struct timeval tv; | 2213 | (*rtt_sample)(sk, tcp_usrtt(skb)); |
2114 | |||
2115 | skb_get_timestamp(skb, &tv); | ||
2116 | *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 | ||
2117 | + (usnow.tv_usec - tv.tv_usec); | ||
2118 | } | 2214 | } |
2119 | |||
2120 | if (sacked & TCPCB_SACKED_ACKED) | 2215 | if (sacked & TCPCB_SACKED_ACKED) |
2121 | tp->sacked_out -= tcp_skb_pcount(skb); | 2216 | tp->sacked_out -= tcp_skb_pcount(skb); |
2122 | if (sacked & TCPCB_LOST) | 2217 | if (sacked & TCPCB_LOST) |
@@ -2126,17 +2221,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt | |||
2126 | !before(scb->end_seq, tp->snd_up)) | 2221 | !before(scb->end_seq, tp->snd_up)) |
2127 | tp->urg_mode = 0; | 2222 | tp->urg_mode = 0; |
2128 | } | 2223 | } |
2129 | } else if (seq_rtt < 0) | 2224 | } else if (seq_rtt < 0) { |
2130 | seq_rtt = now - scb->when; | 2225 | seq_rtt = now - scb->when; |
2226 | if (rtt_sample) | ||
2227 | (*rtt_sample)(sk, tcp_usrtt(skb)); | ||
2228 | } | ||
2131 | tcp_dec_pcount_approx(&tp->fackets_out, skb); | 2229 | tcp_dec_pcount_approx(&tp->fackets_out, skb); |
2132 | tcp_packets_out_dec(tp, skb); | 2230 | tcp_packets_out_dec(tp, skb); |
2133 | __skb_unlink(skb, &sk->sk_write_queue); | 2231 | __skb_unlink(skb, &sk->sk_write_queue); |
2134 | sk_stream_free_skb(sk, skb); | 2232 | sk_stream_free_skb(sk, skb); |
2233 | clear_all_retrans_hints(tp); | ||
2135 | } | 2234 | } |
2136 | 2235 | ||
2137 | if (acked&FLAG_ACKED) { | 2236 | if (acked&FLAG_ACKED) { |
2138 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2237 | tcp_ack_update_rtt(sk, acked, seq_rtt); |
2139 | tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); | ||
2140 | tcp_ack_packets_out(sk, tp); | 2238 | tcp_ack_packets_out(sk, tp); |
2141 | 2239 | ||
2142 | if (icsk->icsk_ca_ops->pkts_acked) | 2240 | if (icsk->icsk_ca_ops->pkts_acked) |
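The tcp_clean_rtx_queue() change above replaces the seq_usrtt out-parameter with a small tcp_usrtt() helper: the microsecond RTT is computed from the skb's transmit timestamp at the point where the acked segment is unlinked, and handed straight to the congestion module's optional rtt_sample hook instead of being threaded back up through tcp_ack(). A minimal userspace sketch of the same pattern; the names below are illustrative, not the kernel's structures:

#include <stdio.h>
#include <sys/time.h>

struct fake_skb {
	struct timeval tstamp;	/* set when the segment was "sent" */
};

struct ca_ops {
	/* NULL when the congestion module takes no RTT samples */
	void (*rtt_sample)(void *sk, unsigned int usrtt);
};

static unsigned int usrtt(const struct fake_skb *skb)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	return (now.tv_sec - skb->tstamp.tv_sec) * 1000000
		+ (now.tv_usec - skb->tstamp.tv_usec);
}

static void sample_cb(void *sk, unsigned int rtt)
{
	printf("sampled rtt: %u usec\n", rtt);
}

int main(void)
{
	struct fake_skb skb;
	struct ca_ops ops = { .rtt_sample = sample_cb };

	gettimeofday(&skb.tstamp, NULL);	/* "transmit" the segment */
	if (ops.rtt_sample)			/* dispatch only if the hook exists */
		ops.rtt_sample(NULL, usrtt(&skb));
	return 0;
}

Guarding the call on the function pointer keeps the common case, a congestion control with no rtt_sample hook, free of the timestamp read entirely, which is what the hunk above achieves relative to the old unconditional do_gettimeofday().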
@@ -2284,7 +2382,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
2284 | } | 2382 | } |
2285 | 2383 | ||
2286 | /* F-RTO affects on two new ACKs following RTO. | 2384 | /* F-RTO affects on two new ACKs following RTO. |
2287 | * At latest on third ACK the TCP behavor is back to normal. | 2385 | * At latest on third ACK the TCP behavior is back to normal. |
2288 | */ | 2386 | */ |
2289 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2387 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
2290 | } | 2388 | } |
@@ -2299,7 +2397,6 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2299 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2397 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
2300 | u32 prior_in_flight; | 2398 | u32 prior_in_flight; |
2301 | s32 seq_rtt; | 2399 | s32 seq_rtt; |
2302 | s32 seq_usrtt = 0; | ||
2303 | int prior_packets; | 2400 | int prior_packets; |
2304 | 2401 | ||
2305 | /* If the ack is newer than sent or older than previous acks | 2402 | /* If the ack is newer than sent or older than previous acks |
@@ -2311,6 +2408,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2311 | if (before(ack, prior_snd_una)) | 2408 | if (before(ack, prior_snd_una)) |
2312 | goto old_ack; | 2409 | goto old_ack; |
2313 | 2410 | ||
2411 | if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR) | ||
2412 | tp->bytes_acked += ack - prior_snd_una; | ||
2413 | |||
2314 | if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { | 2414 | if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { |
2315 | /* Window is constant, pure forward advance. | 2415 | /* Window is constant, pure forward advance. |
2316 | * No more checks are required. | 2416 | * No more checks are required. |
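The two lines added near the top of tcp_ack() are the collection side of Appropriate Byte Counting: while the sender is below the CWR state, every ACK credits the newly acknowledged byte range to tp->bytes_acked, which the congestion-avoidance code then spends in MSS-sized units. A standalone sketch of how such a byte counter might drive window growth under RFC 3465, simplified from the real tcp_slow_start()/Reno paths; the struct is illustrative, not the kernel's tcp_sock:

#include <stdio.h>

struct conn {
	unsigned int snd_cwnd;		/* in segments */
	unsigned int snd_ssthresh;	/* in segments */
	unsigned int bytes_acked;	/* accumulated by the ACK path */
	unsigned int mss;
};

static void abc_grow(struct conn *c, int abc_mode)
{
	if (c->snd_cwnd <= c->snd_ssthresh) {
		/* Slow start: require at least one full MSS acked before
		 * growing; mode 2 may credit two segments per delayed ACK. */
		if (c->bytes_acked < c->mss)
			return;
		if (abc_mode > 1 && c->bytes_acked >= 2 * c->mss) {
			c->bytes_acked -= 2 * c->mss;
			c->snd_cwnd += 2;
		} else {
			c->bytes_acked -= c->mss;
			c->snd_cwnd++;
		}
	} else if (c->bytes_acked >= c->snd_cwnd * c->mss) {
		/* Congestion avoidance: one segment per full cwnd acked. */
		c->bytes_acked -= c->snd_cwnd * c->mss;
		c->snd_cwnd++;
	}
}

int main(void)
{
	struct conn c = { 4, 8, 0, 1460 };

	c.bytes_acked += 1460;	/* what the hunk above accumulates */
	abc_grow(&c, 1);
	printf("cwnd after one MSS acked: %u\n", c.snd_cwnd);	/* 5 */
	return 0;
}

Counting bytes rather than ACKs keeps a delayed-ACK receiver from halving the sender's growth rate during slow start, which is the point of RFC 3465.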
@@ -2352,14 +2452,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
2352 | prior_in_flight = tcp_packets_in_flight(tp); | 2452 | prior_in_flight = tcp_packets_in_flight(tp); |
2353 | 2453 | ||
2354 | /* See if we can take anything off of the retransmit queue. */ | 2454 | /* See if we can take anything off of the retransmit queue. */ |
2355 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, | 2455 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); |
2356 | icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
2357 | 2456 | ||
2358 | if (tp->frto_counter) | 2457 | if (tp->frto_counter) |
2359 | tcp_process_frto(sk, prior_snd_una); | 2458 | tcp_process_frto(sk, prior_snd_una); |
2360 | 2459 | ||
2361 | if (tcp_ack_is_dubious(sk, flag)) { | 2460 | if (tcp_ack_is_dubious(sk, flag)) { |
2362 | /* Advanve CWND, if state allows this. */ | 2461 | /* Advance CWND, if state allows this. */ |
2363 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) | 2462 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) |
2364 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); | 2463 | tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); |
2365 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2464 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
@@ -3148,7 +3247,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, | |||
3148 | { | 3247 | { |
3149 | struct sk_buff *skb; | 3248 | struct sk_buff *skb; |
3150 | 3249 | ||
3151 | /* First, check that queue is collapsable and find | 3250 | /* First, check that queue is collapsible and find |
3152 | * the point where collapsing can be useful. */ | 3251 | * the point where collapsing can be useful. */ |
3153 | for (skb = head; skb != tail; ) { | 3252 | for (skb = head; skb != tail; ) { |
3154 | /* No new bits? It is possible on ofo queue. */ | 3253 | /* No new bits? It is possible on ofo queue. */ |
@@ -3456,7 +3555,7 @@ static __inline__ void tcp_ack_snd_check(struct sock *sk) | |||
3456 | 3555 | ||
3457 | /* | 3556 | /* |
3458 | * This routine is only called when we have urgent data | 3557 | * This routine is only called when we have urgent data |
3459 | * signalled. Its the 'slow' part of tcp_urg. It could be | 3558 | * signaled. Its the 'slow' part of tcp_urg. It could be |
3460 | * moved inline now as tcp_urg is only called from one | 3559 | * moved inline now as tcp_urg is only called from one |
3461 | * place. We handle URGent data wrong. We have to - as | 3560 | * place. We handle URGent data wrong. We have to - as |
3462 | * BSD still doesn't use the correction from RFC961. | 3561 | * BSD still doesn't use the correction from RFC961. |
@@ -3501,7 +3600,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) | |||
3501 | * urgent. To do this requires some care. We cannot just ignore | 3600 | * urgent. To do this requires some care. We cannot just ignore |
3502 | * tp->copied_seq since we would read the last urgent byte again | 3601 | * tp->copied_seq since we would read the last urgent byte again |
3503 | * as data, nor can we alter copied_seq until this data arrives | 3602 | * as data, nor can we alter copied_seq until this data arrives |
3504 | * or we break the sematics of SIOCATMARK (and thus sockatmark()) | 3603 | * or we break the semantics of SIOCATMARK (and thus sockatmark()) |
3505 | * | 3604 | * |
3506 | * NOTE. Double Dutch. Rendering to plain English: author of comment | 3605 | * NOTE. Double Dutch. Rendering to plain English: author of comment |
3507 | * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); | 3606 | * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); |
@@ -3646,7 +3745,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3646 | tp->rx_opt.saw_tstamp = 0; | 3745 | tp->rx_opt.saw_tstamp = 0; |
3647 | 3746 | ||
3648 | /* pred_flags is 0xS?10 << 16 + snd_wnd | 3747 | /* pred_flags is 0xS?10 << 16 + snd_wnd |
3649 | * if header_predition is to be made | 3748 | * if header_prediction is to be made |
3650 | * 'S' will always be tp->tcp_header_len >> 2 | 3749 | * 'S' will always be tp->tcp_header_len >> 2 |
3651 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to | 3750 | * '?' will be 0 for the fast path, otherwise pred_flags is 0 to |
3652 | * turn it off (when there are holes in the receive | 3751 | * turn it off (when there are holes in the receive |
@@ -4242,7 +4341,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
4242 | */ | 4341 | */ |
4243 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4342 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
4244 | !tp->srtt) | 4343 | !tp->srtt) |
4245 | tcp_ack_saw_tstamp(sk, NULL, 0); | 4344 | tcp_ack_saw_tstamp(sk, 0); |
4246 | 4345 | ||
4247 | if (tp->rx_opt.tstamp_ok) | 4346 | if (tp->rx_opt.tstamp_ok) |
4248 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4347 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
@@ -4372,6 +4471,7 @@ discard: | |||
4372 | 4471 | ||
4373 | EXPORT_SYMBOL(sysctl_tcp_ecn); | 4472 | EXPORT_SYMBOL(sysctl_tcp_ecn); |
4374 | EXPORT_SYMBOL(sysctl_tcp_reordering); | 4473 | EXPORT_SYMBOL(sysctl_tcp_reordering); |
4474 | EXPORT_SYMBOL(sysctl_tcp_abc); | ||
4375 | EXPORT_SYMBOL(tcp_parse_options); | 4475 | EXPORT_SYMBOL(tcp_parse_options); |
4376 | EXPORT_SYMBOL(tcp_rcv_established); | 4476 | EXPORT_SYMBOL(tcp_rcv_established); |
4377 | EXPORT_SYMBOL(tcp_rcv_state_process); | 4477 | EXPORT_SYMBOL(tcp_rcv_state_process); |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ac1fcf5b4ebc..4d5021e1929b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -39,7 +39,7 @@ | |||
39 | * request_sock handling and moved | 39 | * request_sock handling and moved |
40 | * most of it into the af independent code. | 40 | * most of it into the af independent code. |
41 | * Added tail drop and some other bugfixes. | 41 | * Added tail drop and some other bugfixes. |
42 | * Added new listen sematics. | 42 | * Added new listen semantics. |
43 | * Mike McLagan : Routing by source | 43 | * Mike McLagan : Routing by source |
44 | * Juan Jose Ciarlante: ip_dynaddr bits | 44 | * Juan Jose Ciarlante: ip_dynaddr bits |
45 | * Andi Kleen: various fixes. | 45 | * Andi Kleen: various fixes. |
@@ -1210,7 +1210,7 @@ int tcp_v4_rcv(struct sk_buff *skb) | |||
1210 | 1210 | ||
1211 | /* An explanation is required here, I think. | 1211 | /* An explanation is required here, I think. |
1212 | * Packet length and doff are validated by header prediction, | 1212 | * Packet length and doff are validated by header prediction, |
1213 | * provided case of th->doff==0 is elimineted. | 1213 | * provided case of th->doff==0 is eliminated. |
1214 | * So, we defer the checks. */ | 1214 | * So, we defer the checks. */ |
1215 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && | 1215 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY && |
1216 | tcp_v4_checksum_init(skb))) | 1216 | tcp_v4_checksum_init(skb))) |
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b1a63b2c6b4a..1b66a2ac4321 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
@@ -158,7 +158,7 @@ kill_with_rst: | |||
158 | /* I am shamed, but failed to make it more elegant. | 158 | /* I am shamed, but failed to make it more elegant. |
159 | * Yes, it is direct reference to IP, which is impossible | 159 | * Yes, it is direct reference to IP, which is impossible |
160 | * to generalize to IPv6. Taking into account that IPv6 | 160 | * to generalize to IPv6. Taking into account that IPv6 |
161 | * do not undertsnad recycling in any case, it not | 161 | * do not understand recycling in any case, it not |
162 | * a big problem in practice. --ANK */ | 162 | * a big problem in practice. --ANK */ |
163 | if (tw->tw_family == AF_INET && | 163 | if (tw->tw_family == AF_INET && |
164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && | 164 | tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && |
@@ -194,7 +194,7 @@ kill_with_rst: | |||
194 | /* In window segment, it may be only reset or bare ack. */ | 194 | /* In window segment, it may be only reset or bare ack. */ |
195 | 195 | ||
196 | if (th->rst) { | 196 | if (th->rst) { |
197 | /* This is TIME_WAIT assasination, in two flavors. | 197 | /* This is TIME_WAIT assassination, in two flavors. |
198 | * Oh well... nobody has a sufficient solution to this | 198 | * Oh well... nobody has a sufficient solution to this |
199 | * protocol bug yet. | 199 | * protocol bug yet. |
200 | */ | 200 | */ |
@@ -380,6 +380,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
380 | */ | 380 | */ |
381 | newtp->snd_cwnd = 2; | 381 | newtp->snd_cwnd = 2; |
382 | newtp->snd_cwnd_cnt = 0; | 382 | newtp->snd_cwnd_cnt = 0; |
383 | newtp->bytes_acked = 0; | ||
383 | 384 | ||
384 | newtp->frto_counter = 0; | 385 | newtp->frto_counter = 0; |
385 | newtp->frto_highmark = 0; | 386 | newtp->frto_highmark = 0; |
@@ -550,7 +551,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, | |||
550 | 551 | ||
551 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... | 552 | /* RFC793 page 36: "If the connection is in any non-synchronized state ... |
552 | * and the incoming segment acknowledges something not yet | 553 | * and the incoming segment acknowledges something not yet |
553 | * sent (the segment carries an unaccaptable ACK) ... | 554 | * sent (the segment carries an unacceptable ACK) ... |
554 | * a reset is sent." | 555 | * a reset is sent." |
555 | * | 556 | * |
556 | * Invalid ACK: reset will be sent by listening socket | 557 | * Invalid ACK: reset will be sent by listening socket |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b907456a79f4..029c70dfb585 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -436,6 +436,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss | |||
436 | u16 flags; | 436 | u16 flags; |
437 | 437 | ||
438 | BUG_ON(len > skb->len); | 438 | BUG_ON(len > skb->len); |
439 | |||
440 | clear_all_retrans_hints(tp); | ||
439 | nsize = skb_headlen(skb) - len; | 441 | nsize = skb_headlen(skb) - len; |
440 | if (nsize < 0) | 442 | if (nsize < 0) |
441 | nsize = 0; | 443 | nsize = 0; |
@@ -599,7 +601,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) | |||
599 | for TCP options, but includes only bare TCP header. | 601 | for TCP options, but includes only bare TCP header. |
600 | 602 | ||
601 | tp->rx_opt.mss_clamp is mss negotiated at connection setup. | 603 | tp->rx_opt.mss_clamp is mss negotiated at connection setup. |
602 | It is minumum of user_mss and mss received with SYN. | 604 | It is minimum of user_mss and mss received with SYN. |
603 | It also does not include TCP options. | 605 | It also does not include TCP options. |
604 | 606 | ||
605 | tp->pmtu_cookie is last pmtu, seen by this function. | 607 | tp->pmtu_cookie is last pmtu, seen by this function. |
@@ -1171,7 +1173,7 @@ u32 __tcp_select_window(struct sock *sk) | |||
1171 | { | 1173 | { |
1172 | struct inet_connection_sock *icsk = inet_csk(sk); | 1174 | struct inet_connection_sock *icsk = inet_csk(sk); |
1173 | struct tcp_sock *tp = tcp_sk(sk); | 1175 | struct tcp_sock *tp = tcp_sk(sk); |
1174 | /* MSS for the peer's data. Previous verions used mss_clamp | 1176 | /* MSS for the peer's data. Previous versions used mss_clamp |
1175 | * here. I don't know if the value based on our guesses | 1177 | * here. I don't know if the value based on our guesses |
1176 | * of peer's MSS is better for the performance. It's more correct | 1178 | * of peer's MSS is better for the performance. It's more correct |
1177 | * but may be worse for the performance because of rcv_mss | 1179 | * but may be worse for the performance because of rcv_mss |
@@ -1260,7 +1262,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m | |||
1260 | BUG_ON(tcp_skb_pcount(skb) != 1 || | 1262 | BUG_ON(tcp_skb_pcount(skb) != 1 || |
1261 | tcp_skb_pcount(next_skb) != 1); | 1263 | tcp_skb_pcount(next_skb) != 1); |
1262 | 1264 | ||
1263 | /* Ok. We will be able to collapse the packet. */ | 1265 | /* changing transmit queue under us so clear hints */ |
1266 | clear_all_retrans_hints(tp); | ||
1267 | |||
1268 | /* Ok. We will be able to collapse the packet. */ | ||
1264 | __skb_unlink(next_skb, &sk->sk_write_queue); | 1269 | __skb_unlink(next_skb, &sk->sk_write_queue); |
1265 | 1270 | ||
1266 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); | 1271 | memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); |
@@ -1330,6 +1335,8 @@ void tcp_simple_retransmit(struct sock *sk) | |||
1330 | } | 1335 | } |
1331 | } | 1336 | } |
1332 | 1337 | ||
1338 | clear_all_retrans_hints(tp); | ||
1339 | |||
1333 | if (!lost) | 1340 | if (!lost) |
1334 | return; | 1341 | return; |
1335 | 1342 | ||
@@ -1361,7 +1368,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
1361 | int err; | 1368 | int err; |
1362 | 1369 | ||
1363 | /* Do not sent more than we queued. 1/4 is reserved for possible | 1370 | /* Do not sent more than we queued. 1/4 is reserved for possible |
1364 | * copying overhead: frgagmentation, tunneling, mangling etc. | 1371 | * copying overhead: fragmentation, tunneling, mangling etc. |
1365 | */ | 1372 | */ |
1366 | if (atomic_read(&sk->sk_wmem_alloc) > | 1373 | if (atomic_read(&sk->sk_wmem_alloc) > |
1367 | min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) | 1374 | min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) |
@@ -1468,13 +1475,25 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1468 | const struct inet_connection_sock *icsk = inet_csk(sk); | 1475 | const struct inet_connection_sock *icsk = inet_csk(sk); |
1469 | struct tcp_sock *tp = tcp_sk(sk); | 1476 | struct tcp_sock *tp = tcp_sk(sk); |
1470 | struct sk_buff *skb; | 1477 | struct sk_buff *skb; |
1471 | int packet_cnt = tp->lost_out; | 1478 | int packet_cnt; |
1479 | |||
1480 | if (tp->retransmit_skb_hint) { | ||
1481 | skb = tp->retransmit_skb_hint; | ||
1482 | packet_cnt = tp->retransmit_cnt_hint; | ||
1483 | }else{ | ||
1484 | skb = sk->sk_write_queue.next; | ||
1485 | packet_cnt = 0; | ||
1486 | } | ||
1472 | 1487 | ||
1473 | /* First pass: retransmit lost packets. */ | 1488 | /* First pass: retransmit lost packets. */ |
1474 | if (packet_cnt) { | 1489 | if (tp->lost_out) { |
1475 | sk_stream_for_retrans_queue(skb, sk) { | 1490 | sk_stream_for_retrans_queue_from(skb, sk) { |
1476 | __u8 sacked = TCP_SKB_CB(skb)->sacked; | 1491 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
1477 | 1492 | ||
1493 | /* we could do better than to assign each time */ | ||
1494 | tp->retransmit_skb_hint = skb; | ||
1495 | tp->retransmit_cnt_hint = packet_cnt; | ||
1496 | |||
1478 | /* Assume this retransmit will generate | 1497 | /* Assume this retransmit will generate |
1479 | * only one packet for congestion window | 1498 | * only one packet for congestion window |
1480 | * calculation purposes. This works because | 1499 | * calculation purposes. This works because |
@@ -1485,10 +1504,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1485 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | 1504 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) |
1486 | return; | 1505 | return; |
1487 | 1506 | ||
1488 | if (sacked&TCPCB_LOST) { | 1507 | if (sacked & TCPCB_LOST) { |
1489 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | 1508 | if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { |
1490 | if (tcp_retransmit_skb(sk, skb)) | 1509 | if (tcp_retransmit_skb(sk, skb)) { |
1510 | tp->retransmit_skb_hint = NULL; | ||
1491 | return; | 1511 | return; |
1512 | } | ||
1492 | if (icsk->icsk_ca_state != TCP_CA_Loss) | 1513 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
1493 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); | 1514 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); |
1494 | else | 1515 | else |
@@ -1501,8 +1522,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1501 | TCP_RTO_MAX); | 1522 | TCP_RTO_MAX); |
1502 | } | 1523 | } |
1503 | 1524 | ||
1504 | packet_cnt -= tcp_skb_pcount(skb); | 1525 | packet_cnt += tcp_skb_pcount(skb); |
1505 | if (packet_cnt <= 0) | 1526 | if (packet_cnt >= tp->lost_out) |
1506 | break; | 1527 | break; |
1507 | } | 1528 | } |
1508 | } | 1529 | } |
@@ -1528,9 +1549,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1528 | if (tcp_may_send_now(sk, tp)) | 1549 | if (tcp_may_send_now(sk, tp)) |
1529 | return; | 1550 | return; |
1530 | 1551 | ||
1531 | packet_cnt = 0; | 1552 | if (tp->forward_skb_hint) { |
1553 | skb = tp->forward_skb_hint; | ||
1554 | packet_cnt = tp->forward_cnt_hint; | ||
1555 | } else{ | ||
1556 | skb = sk->sk_write_queue.next; | ||
1557 | packet_cnt = 0; | ||
1558 | } | ||
1559 | |||
1560 | sk_stream_for_retrans_queue_from(skb, sk) { | ||
1561 | tp->forward_cnt_hint = packet_cnt; | ||
1562 | tp->forward_skb_hint = skb; | ||
1532 | 1563 | ||
1533 | sk_stream_for_retrans_queue(skb, sk) { | ||
1534 | /* Similar to the retransmit loop above we | 1564 | /* Similar to the retransmit loop above we |
1535 | * can pretend that the retransmitted SKB | 1565 | * can pretend that the retransmitted SKB |
1536 | * we send out here will be composed of one | 1566 | * we send out here will be composed of one |
@@ -1547,8 +1577,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
1547 | continue; | 1577 | continue; |
1548 | 1578 | ||
1549 | /* Ok, retransmit it. */ | 1579 | /* Ok, retransmit it. */ |
1550 | if (tcp_retransmit_skb(sk, skb)) | 1580 | if (tcp_retransmit_skb(sk, skb)) { |
1581 | tp->forward_skb_hint = NULL; | ||
1551 | break; | 1582 | break; |
1583 | } | ||
1552 | 1584 | ||
1553 | if (skb == skb_peek(&sk->sk_write_queue)) | 1585 | if (skb == skb_peek(&sk->sk_write_queue)) |
1554 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 1586 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
@@ -2058,3 +2090,4 @@ EXPORT_SYMBOL(tcp_connect); | |||
2058 | EXPORT_SYMBOL(tcp_make_synack); | 2090 | EXPORT_SYMBOL(tcp_make_synack); |
2059 | EXPORT_SYMBOL(tcp_simple_retransmit); | 2091 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2060 | EXPORT_SYMBOL(tcp_sync_mss); | 2092 | EXPORT_SYMBOL(tcp_sync_mss); |
2093 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); | ||
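The tcp_output.c side of the hint machinery shows the pattern clearly: tcp_xmit_retransmit_queue() resumes its lost-packet and forward-transmit scans from retransmit_skb_hint/forward_skb_hint instead of walking the whole write queue from the head on every ACK, and every path that reshuffles the queue (tcp_fragment(), collapsing on retransmit, tcp_simple_retransmit()) drops the cached positions via clear_all_retrans_hints(). A toy version of the resume-from-hint idea in plain userspace C, with illustrative types rather than the kernel's queue:

#include <stdio.h>

struct seg {
	int seqno;
	int lost;
	struct seg *next;
};

struct queue {
	struct seg *head;
	struct seg *hint;	/* resume point; NULL = start over */
};

static void clear_hints(struct queue *q)
{
	q->hint = NULL;		/* queue changed under us */
}

static struct seg *next_lost(struct queue *q)
{
	struct seg *s = q->hint ? q->hint : q->head;

	for (; s; s = s->next) {
		if (s->lost) {
			s->lost = 0;	/* mark retransmitted, as the kernel
					 * does with TCPCB_SACKED_RETRANS */
			q->hint = s;	/* resume here next time */
			return s;
		}
	}
	return NULL;
}

int main(void)
{
	struct seg c = { 3, 1, NULL }, b = { 2, 0, &c }, a = { 1, 1, &b };
	struct queue q = { &a, NULL };
	struct seg *s;

	while ((s = next_lost(&q)))	/* O(n) total, not O(n) per call */
		printf("retransmit seq %d\n", s->seqno);
	clear_hints(&q);		/* e.g. after fragmenting a segment */
	return 0;
}

The invalidation rule is the crux of the design: a stale hint into a rebuilt list is worse than no hint at all, so every mutation site must clear it, exactly as the hunks above do on tcp_retransmit_skb() failure and on queue surgery.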
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 327770bf5522..26d7486ee501 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c | |||
@@ -20,20 +20,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
20 | u32 in_flight, int flag) | 20 | u32 in_flight, int flag) |
21 | { | 21 | { |
22 | struct tcp_sock *tp = tcp_sk(sk); | 22 | struct tcp_sock *tp = tcp_sk(sk); |
23 | if (in_flight < tp->snd_cwnd) | 23 | |
24 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
24 | return; | 25 | return; |
25 | 26 | ||
26 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 27 | if (tp->snd_cwnd <= tp->snd_ssthresh) |
27 | tp->snd_cwnd++; | 28 | tcp_slow_start(tp); |
28 | } else { | 29 | else { |
29 | tp->snd_cwnd_cnt++; | 30 | tp->snd_cwnd_cnt++; |
30 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | 31 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ |
31 | tp->snd_cwnd++; | 32 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) |
33 | tp->snd_cwnd++; | ||
32 | tp->snd_cwnd_cnt = 0; | 34 | tp->snd_cwnd_cnt = 0; |
33 | } | 35 | } |
34 | } | 36 | } |
35 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
36 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
37 | } | 37 | } |
38 | 38 | ||
39 | static u32 tcp_scalable_ssthresh(struct sock *sk) | 39 | static u32 tcp_scalable_ssthresh(struct sock *sk) |
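The tcp_scalable.c rewrite is behavior-preserving restructuring: the open-coded in_flight test becomes tcp_is_cwnd_limited(), slow start goes through the shared tcp_slow_start() helper (picking up the ABC logic for free), and the clamp is checked before the increment rather than applied afterwards. The additive step itself is unchanged: one extra segment per min(snd_cwnd, TCP_SCALABLE_AI_CNT) ACKs. A standalone sketch of that arithmetic, with an illustrative struct in place of tcp_sock:

#include <stdio.h>

#define TCP_SCALABLE_AI_CNT 50U

struct conn {
	unsigned int snd_cwnd, snd_cwnd_cnt, snd_cwnd_clamp;
};

static unsigned int min_u32(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

static void scalable_ca_step(struct conn *c)	/* one ACK in CA */
{
	if (++c->snd_cwnd_cnt > min_u32(c->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
		if (c->snd_cwnd < c->snd_cwnd_clamp)
			c->snd_cwnd++;	/* clamp now checked before ++ */
		c->snd_cwnd_cnt = 0;
	}
}

int main(void)
{
	struct conn c = { 200, 0, 0xffff };
	int ack;

	for (ack = 0; ack < 200; ack++)	/* ~one RTT's worth of ACKs */
		scalable_ca_step(&c);
	printf("cwnd grew to %u\n", c.snd_cwnd);	/* 203: +1 per 51 ACKs */
	return 0;
}

Capping the counter at 50 ACKs is what makes the growth "scalable": beyond cwnd = 50 the per-RTT increase is proportional to the window, instead of the fixed one segment per RTT of Reno.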
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 415ee47ac1c5..e1880959614a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c | |||
@@ -58,7 +58,7 @@ static void tcp_write_err(struct sock *sk) | |||
58 | * to prevent DoS attacks. It is called when a retransmission timeout | 58 | * to prevent DoS attacks. It is called when a retransmission timeout |
59 | * or zero probe timeout occurs on orphaned socket. | 59 | * or zero probe timeout occurs on orphaned socket. |
60 | * | 60 | * |
61 | * Criterium is still not confirmed experimentally and may change. | 61 | * Criteria is still not confirmed experimentally and may change. |
62 | * We kill the socket, if: | 62 | * We kill the socket, if: |
63 | * 1. If number of orphaned sockets exceeds an administratively configured | 63 | * 1. If number of orphaned sockets exceeds an administratively configured |
64 | * limit. | 64 | * limit. |
@@ -132,7 +132,7 @@ static int tcp_write_timeout(struct sock *sk) | |||
132 | hole detection. :-( | 132 | hole detection. :-( |
133 | 133 | ||
134 | It is place to make it. It is not made. I do not want | 134 | It is place to make it. It is not made. I do not want |
135 | to make it. It is disguisting. It does not work in any | 135 | to make it. It is disgusting. It does not work in any |
136 | case. Let me to cite the same draft, which requires for | 136 | case. Let me to cite the same draft, which requires for |
137 | us to implement this: | 137 | us to implement this: |
138 | 138 | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 93c5f92070f9..4376814d29fb 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c | |||
@@ -236,8 +236,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
236 | /* We don't have enough RTT samples to do the Vegas | 236 | /* We don't have enough RTT samples to do the Vegas |
237 | * calculation, so we'll behave like Reno. | 237 | * calculation, so we'll behave like Reno. |
238 | */ | 238 | */ |
239 | if (tp->snd_cwnd > tp->snd_ssthresh) | 239 | tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, cnt); |
240 | tp->snd_cwnd++; | ||
241 | } else { | 240 | } else { |
242 | u32 rtt, target_cwnd, diff; | 241 | u32 rtt, target_cwnd, diff; |
243 | 242 | ||
@@ -275,7 +274,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
275 | */ | 274 | */ |
276 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | 275 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; |
277 | 276 | ||
278 | if (tp->snd_cwnd < tp->snd_ssthresh) { | 277 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
279 | /* Slow start. */ | 278 | /* Slow start. */ |
280 | if (diff > gamma) { | 279 | if (diff > gamma) { |
281 | /* Going too fast. Time to slow down | 280 | /* Going too fast. Time to slow down |
@@ -295,6 +294,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
295 | V_PARAM_SHIFT)+1); | 294 | V_PARAM_SHIFT)+1); |
296 | 295 | ||
297 | } | 296 | } |
297 | tcp_slow_start(tp); | ||
298 | } else { | 298 | } else { |
299 | /* Congestion avoidance. */ | 299 | /* Congestion avoidance. */ |
300 | u32 next_snd_cwnd; | 300 | u32 next_snd_cwnd; |
@@ -327,37 +327,17 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, | |||
327 | else if (next_snd_cwnd < tp->snd_cwnd) | 327 | else if (next_snd_cwnd < tp->snd_cwnd) |
328 | tp->snd_cwnd--; | 328 | tp->snd_cwnd--; |
329 | } | 329 | } |
330 | } | ||
331 | 330 | ||
332 | /* Wipe the slate clean for the next RTT. */ | 331 | if (tp->snd_cwnd < 2) |
333 | vegas->cntRTT = 0; | 332 | tp->snd_cwnd = 2; |
334 | vegas->minRTT = 0x7fffffff; | 333 | else if (tp->snd_cwnd > tp->snd_cwnd_clamp) |
334 | tp->snd_cwnd = tp->snd_cwnd_clamp; | ||
335 | } | ||
335 | } | 336 | } |
336 | 337 | ||
337 | /* The following code is executed for every ack we receive, | 338 | /* Wipe the slate clean for the next RTT. */ |
338 | * except for conditions checked in should_advance_cwnd() | 339 | vegas->cntRTT = 0; |
339 | * before the call to tcp_cong_avoid(). Mainly this means that | 340 | vegas->minRTT = 0x7fffffff; |
340 | * we only execute this code if the ack actually acked some | ||
341 | * data. | ||
342 | */ | ||
343 | |||
344 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
345 | * (If we are not in slow start then we are in congestion avoidance, | ||
346 | * and adjust our congestion window only once per RTT. See the code | ||
347 | * above.) | ||
348 | */ | ||
349 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
350 | tp->snd_cwnd++; | ||
351 | |||
352 | /* to keep cwnd from growing without bound */ | ||
353 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
354 | |||
355 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
356 | * 2 MSS. | ||
357 | * | ||
358 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
359 | */ | ||
360 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
361 | } | 341 | } |
362 | 342 | ||
363 | /* Extract info for Tcp socket info provided via netlink. */ | 343 | /* Extract info for Tcp socket info provided via netlink. */ |
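The Vegas rewrite above delegates to tcp_reno_cong_avoid() when too few RTT samples have arrived, reuses tcp_slow_start() inside the gamma-gated slow-start branch, and hoists the 2-segment floor, the clamp, and the per-RTT slate-wiping out of the individual branches. The once-per-RTT core is untouched: compare the window against target_cwnd = cwnd * baseRTT / rtt and nudge it by one segment according to alpha/beta. A standalone sketch of that comparison, with illustrative parameter values and the V_PARAM_SHIFT fixed-point scaling of the real code dropped:

#include <stdio.h>

static unsigned int vegas_next_cwnd(unsigned int cwnd,
				    unsigned int base_rtt_us,
				    unsigned int rtt_us,
				    unsigned int alpha, unsigned int beta)
{
	/* segments in flight beyond what baseRTT alone would need */
	unsigned int target = cwnd * base_rtt_us / rtt_us;
	unsigned int diff = cwnd - target;

	if (diff > beta)
		cwnd--;		/* queue building up: back off */
	else if (diff < alpha)
		cwnd++;		/* spare capacity: probe */
	/* between alpha and beta: hold steady */

	if (cwnd < 2)		/* floor/clamp applied once, after the  */
		cwnd = 2;	/* branches, as in the rewritten version */
	return cwnd;
}

int main(void)
{
	/* 40 ms base RTT, 50 ms measured: target is 40 * 40/50 = 32 segs */
	printf("cwnd -> %u\n", vegas_next_cwnd(40, 40000, 50000, 2, 4));	/* 39 */
	return 0;
}

Because diff exceeds beta in this example, Vegas infers standing queue and steps the window down by one segment, the opposite reaction to loss-based Reno under the same ACK stream.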