diff options
| author | Stephen Hemminger <shemminger@osdl.org> | 2005-06-23 15:19:55 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2005-06-23 15:19:55 -0400 |
| commit | 317a76f9a44b437d6301718f4e5d08bd93f98da7 (patch) | |
| tree | caeba9839dee264f59b035b81c3d13d6c61b638e | |
| parent | a8ad86f2dc46356f87be1327dabc18bdbda32f50 (diff) | |
[TCP]: Add pluggable congestion control algorithm infrastructure.
Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in
or built as modules. The legacy "new RENO" algorithm is used as a starting
point and fallback.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
| -rw-r--r-- | include/linux/sysctl.h | 9 | ||||
| -rw-r--r-- | include/linux/tcp.h | 49 | ||||
| -rw-r--r-- | include/net/tcp.h | 237 | ||||
| -rw-r--r-- | net/ipv4/Makefile | 3 | ||||
| -rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 114 | ||||
| -rw-r--r-- | net/ipv4/tcp.c | 2 | ||||
| -rw-r--r-- | net/ipv4/tcp_cong.c | 195 | ||||
| -rw-r--r-- | net/ipv4/tcp_diag.c | 20 | ||||
| -rw-r--r-- | net/ipv4/tcp_input.c | 737 | ||||
| -rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
| -rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 | ||||
| -rw-r--r-- | net/ipv4/tcp_output.c | 23 | ||||
| -rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 |
13 files changed, 399 insertions, 999 deletions
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a4..72965bfe6cfb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h | |||
| @@ -333,21 +333,14 @@ enum | |||
| 333 | NET_TCP_FRTO=92, | 333 | NET_TCP_FRTO=92, |
| 334 | NET_TCP_LOW_LATENCY=93, | 334 | NET_TCP_LOW_LATENCY=93, |
| 335 | NET_IPV4_IPFRAG_SECRET_INTERVAL=94, | 335 | NET_IPV4_IPFRAG_SECRET_INTERVAL=94, |
| 336 | NET_TCP_WESTWOOD=95, | ||
| 337 | NET_IPV4_IGMP_MAX_MSF=96, | 336 | NET_IPV4_IGMP_MAX_MSF=96, |
| 338 | NET_TCP_NO_METRICS_SAVE=97, | 337 | NET_TCP_NO_METRICS_SAVE=97, |
| 339 | NET_TCP_VEGAS=98, | ||
| 340 | NET_TCP_VEGAS_ALPHA=99, | ||
| 341 | NET_TCP_VEGAS_BETA=100, | ||
| 342 | NET_TCP_VEGAS_GAMMA=101, | ||
| 343 | NET_TCP_BIC=102, | ||
| 344 | NET_TCP_BIC_FAST_CONVERGENCE=103, | ||
| 345 | NET_TCP_BIC_LOW_WINDOW=104, | ||
| 346 | NET_TCP_DEFAULT_WIN_SCALE=105, | 338 | NET_TCP_DEFAULT_WIN_SCALE=105, |
| 347 | NET_TCP_MODERATE_RCVBUF=106, | 339 | NET_TCP_MODERATE_RCVBUF=106, |
| 348 | NET_TCP_TSO_WIN_DIVISOR=107, | 340 | NET_TCP_TSO_WIN_DIVISOR=107, |
| 349 | NET_TCP_BIC_BETA=108, | 341 | NET_TCP_BIC_BETA=108, |
| 350 | NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, | 342 | NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, |
| 343 | NET_TCP_CONG_CONTROL=110, | ||
| 351 | }; | 344 | }; |
| 352 | 345 | ||
| 353 | enum { | 346 | enum { |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df5..3ea75dd6640a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h | |||
| @@ -203,13 +203,6 @@ struct tcp_sack_block { | |||
| 203 | __u32 end_seq; | 203 | __u32 end_seq; |
| 204 | }; | 204 | }; |
| 205 | 205 | ||
| 206 | enum tcp_congestion_algo { | ||
| 207 | TCP_RENO=0, | ||
| 208 | TCP_VEGAS, | ||
| 209 | TCP_WESTWOOD, | ||
| 210 | TCP_BIC, | ||
| 211 | }; | ||
| 212 | |||
| 213 | struct tcp_options_received { | 206 | struct tcp_options_received { |
| 214 | /* PAWS/RTTM data */ | 207 | /* PAWS/RTTM data */ |
| 215 | long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ | 208 | long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ |
| @@ -305,7 +298,7 @@ struct tcp_sock { | |||
| 305 | __u8 reordering; /* Packet reordering metric. */ | 298 | __u8 reordering; /* Packet reordering metric. */ |
| 306 | __u8 frto_counter; /* Number of new acks after RTO */ | 299 | __u8 frto_counter; /* Number of new acks after RTO */ |
| 307 | 300 | ||
| 308 | __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ | 301 | __u8 unused; |
| 309 | __u8 defer_accept; /* User waits for some data after accept() */ | 302 | __u8 defer_accept; /* User waits for some data after accept() */ |
| 310 | 303 | ||
| 311 | /* RTT measurement */ | 304 | /* RTT measurement */ |
| @@ -401,37 +394,10 @@ struct tcp_sock { | |||
| 401 | __u32 time; | 394 | __u32 time; |
| 402 | } rcvq_space; | 395 | } rcvq_space; |
| 403 | 396 | ||
| 404 | /* TCP Westwood structure */ | 397 | /* Pluggable TCP congestion control hook */ |
| 405 | struct { | 398 | struct tcp_congestion_ops *ca_ops; |
| 406 | __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ | 399 | u32 ca_priv[16]; |
| 407 | __u32 bw_est; /* bandwidth estimate */ | 400 | #define TCP_CA_PRIV_SIZE (16*sizeof(u32)) |
| 408 | __u32 rtt_win_sx; /* here starts a new evaluation... */ | ||
| 409 | __u32 bk; | ||
| 410 | __u32 snd_una; /* used for evaluating the number of acked bytes */ | ||
| 411 | __u32 cumul_ack; | ||
| 412 | __u32 accounted; | ||
| 413 | __u32 rtt; | ||
| 414 | __u32 rtt_min; /* minimum observed RTT */ | ||
| 415 | } westwood; | ||
| 416 | |||
| 417 | /* Vegas variables */ | ||
| 418 | struct { | ||
| 419 | __u32 beg_snd_nxt; /* right edge during last RTT */ | ||
| 420 | __u32 beg_snd_una; /* left edge during last RTT */ | ||
| 421 | __u32 beg_snd_cwnd; /* saves the size of the cwnd */ | ||
| 422 | __u8 doing_vegas_now;/* if true, do vegas for this RTT */ | ||
| 423 | __u16 cntRTT; /* # of RTTs measured within last RTT */ | ||
| 424 | __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ | ||
| 425 | __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ | ||
| 426 | } vegas; | ||
| 427 | |||
| 428 | /* BI TCP Parameters */ | ||
| 429 | struct { | ||
| 430 | __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ | ||
| 431 | __u32 last_max_cwnd; /* last maximium snd_cwnd */ | ||
| 432 | __u32 last_cwnd; /* the last snd_cwnd */ | ||
| 433 | __u32 last_stamp; /* time when updated last_cwnd */ | ||
| 434 | } bictcp; | ||
| 435 | }; | 401 | }; |
| 436 | 402 | ||
| 437 | static inline struct tcp_sock *tcp_sk(const struct sock *sk) | 403 | static inline struct tcp_sock *tcp_sk(const struct sock *sk) |
| @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) | |||
| 439 | return (struct tcp_sock *)sk; | 405 | return (struct tcp_sock *)sk; |
| 440 | } | 406 | } |
| 441 | 407 | ||
| 408 | static inline void *tcp_ca(const struct tcp_sock *tp) | ||
| 409 | { | ||
| 410 | return (void *) tp->ca_priv; | ||
| 411 | } | ||
| 412 | |||
| 442 | #endif | 413 | #endif |
| 443 | 414 | ||
| 444 | #endif /* _LINUX_TCP_H */ | 415 | #endif /* _LINUX_TCP_H */ |
diff --git a/include/net/tcp.h b/include/net/tcp.h index f730935b824a..e427cf35915c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h | |||
| @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) | |||
| 505 | #else | 505 | #else |
| 506 | # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) | 506 | # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) |
| 507 | #endif | 507 | #endif |
| 508 | |||
| 509 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation | ||
| 510 | * max_cwnd = snd_cwnd * beta | ||
| 511 | */ | ||
| 512 | #define BICTCP_MAX_INCREMENT 32 /* | ||
| 513 | * Limit on the amount of | ||
| 514 | * increment allowed during | ||
| 515 | * binary search. | ||
| 516 | */ | ||
| 517 | #define BICTCP_FUNC_OF_MIN_INCR 11 /* | ||
| 518 | * log(B/Smin)/log(B/(B-1))+1, | ||
| 519 | * Smin:min increment | ||
| 520 | * B:log factor | ||
| 521 | */ | ||
| 522 | #define BICTCP_B 4 /* | ||
| 523 | * In binary search, | ||
| 524 | * go to point (max+min)/N | ||
| 525 | */ | ||
| 526 | |||
| 527 | /* | 508 | /* |
| 528 | * TCP option | 509 | * TCP option |
| 529 | */ | 510 | */ |
| @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; | |||
| 596 | extern int sysctl_tcp_tw_reuse; | 577 | extern int sysctl_tcp_tw_reuse; |
| 597 | extern int sysctl_tcp_frto; | 578 | extern int sysctl_tcp_frto; |
| 598 | extern int sysctl_tcp_low_latency; | 579 | extern int sysctl_tcp_low_latency; |
| 599 | extern int sysctl_tcp_westwood; | ||
| 600 | extern int sysctl_tcp_vegas_cong_avoid; | ||
| 601 | extern int sysctl_tcp_vegas_alpha; | ||
| 602 | extern int sysctl_tcp_vegas_beta; | ||
| 603 | extern int sysctl_tcp_vegas_gamma; | ||
| 604 | extern int sysctl_tcp_nometrics_save; | 580 | extern int sysctl_tcp_nometrics_save; |
| 605 | extern int sysctl_tcp_bic; | ||
| 606 | extern int sysctl_tcp_bic_fast_convergence; | ||
| 607 | extern int sysctl_tcp_bic_low_window; | ||
| 608 | extern int sysctl_tcp_bic_beta; | ||
| 609 | extern int sysctl_tcp_moderate_rcvbuf; | 581 | extern int sysctl_tcp_moderate_rcvbuf; |
| 610 | extern int sysctl_tcp_tso_win_divisor; | 582 | extern int sysctl_tcp_tso_win_divisor; |
| 611 | 583 | ||
| @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, | |||
| 1136 | tp->packets_out -= tcp_skb_pcount(skb); | 1108 | tp->packets_out -= tcp_skb_pcount(skb); |
| 1137 | } | 1109 | } |
| 1138 | 1110 | ||
| 1111 | /* Events passed to congestion control interface */ | ||
| 1112 | enum tcp_ca_event { | ||
| 1113 | CA_EVENT_TX_START, /* first transmit when no packets in flight */ | ||
| 1114 | CA_EVENT_CWND_RESTART, /* congestion window restart */ | ||
| 1115 | CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ | ||
| 1116 | CA_EVENT_FRTO, /* fast recovery timeout */ | ||
| 1117 | CA_EVENT_LOSS, /* loss timeout */ | ||
| 1118 | CA_EVENT_FAST_ACK, /* in sequence ack */ | ||
| 1119 | CA_EVENT_SLOW_ACK, /* other ack */ | ||
| 1120 | }; | ||
| 1121 | |||
| 1122 | /* | ||
| 1123 | * Interface for adding new TCP congestion control handlers | ||
| 1124 | */ | ||
| 1125 | #define TCP_CA_NAME_MAX 16 | ||
| 1126 | struct tcp_congestion_ops { | ||
| 1127 | struct list_head list; | ||
| 1128 | |||
| 1129 | /* initialize private data (optional) */ | ||
| 1130 | void (*init)(struct tcp_sock *tp); | ||
| 1131 | /* cleanup private data (optional) */ | ||
| 1132 | void (*release)(struct tcp_sock *tp); | ||
| 1133 | |||
| 1134 | /* return slow start threshold (required) */ | ||
| 1135 | u32 (*ssthresh)(struct tcp_sock *tp); | ||
| 1136 | /* lower bound for congestion window (required) */ | ||
| 1137 | u32 (*min_cwnd)(struct tcp_sock *tp); | ||
| 1138 | /* do new cwnd calculation (required) */ | ||
| 1139 | void (*cong_avoid)(struct tcp_sock *tp, u32 ack, | ||
| 1140 | u32 rtt, u32 in_flight, int good_ack); | ||
| 1141 | /* round trip time sample per acked packet (optional) */ | ||
| 1142 | void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); | ||
| 1143 | /* call before changing ca_state (optional) */ | ||
| 1144 | void (*set_state)(struct tcp_sock *tp, u8 new_state); | ||
| 1145 | /* call when cwnd event occurs (optional) */ | ||
| 1146 | void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); | ||
| 1147 | /* new value of cwnd after loss (optional) */ | ||
| 1148 | u32 (*undo_cwnd)(struct tcp_sock *tp); | ||
| 1149 | /* hook for packet ack accounting (optional) */ | ||
| 1150 | void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); | ||
| 1151 | /* get info for tcp_diag (optional) */ | ||
| 1152 | void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); | ||
| 1153 | |||
| 1154 | char name[TCP_CA_NAME_MAX]; | ||
| 1155 | struct module *owner; | ||
| 1156 | }; | ||
| 1157 | |||
| 1158 | extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); | ||
| 1159 | extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); | ||
| 1160 | |||
| 1161 | extern void tcp_init_congestion_control(struct tcp_sock *tp); | ||
| 1162 | extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); | ||
| 1163 | extern int tcp_set_default_congestion_control(const char *name); | ||
| 1164 | extern void tcp_get_default_congestion_control(char *name); | ||
| 1165 | |||
| 1166 | extern struct tcp_congestion_ops tcp_reno; | ||
| 1167 | extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); | ||
| 1168 | extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
| 1169 | u32 rtt, u32 in_flight, int flag); | ||
| 1170 | extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); | ||
| 1171 | |||
| 1172 | static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) | ||
| 1173 | { | ||
| 1174 | if (tp->ca_ops->set_state) | ||
| 1175 | tp->ca_ops->set_state(tp, ca_state); | ||
| 1176 | tp->ca_state = ca_state; | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) | ||
| 1180 | { | ||
| 1181 | if (tp->ca_ops->cwnd_event) | ||
| 1182 | tp->ca_ops->cwnd_event(tp, event); | ||
| 1183 | } | ||
| 1184 | |||
| 1139 | /* This determines how many packets are "in the network" to the best | 1185 | /* This determines how many packets are "in the network" to the best |
| 1140 | * of our knowledge. In many cases it is conservative, but where | 1186 | * of our knowledge. In many cases it is conservative, but where |
| 1141 | * detailed information is available from the receiver (via SACK | 1187 | * detailed information is available from the receiver (via SACK |
| @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) | |||
| 1155 | return (tp->packets_out - tp->left_out + tp->retrans_out); | 1201 | return (tp->packets_out - tp->left_out + tp->retrans_out); |
| 1156 | } | 1202 | } |
| 1157 | 1203 | ||
| 1158 | /* | ||
| 1159 | * Which congestion algorithim is in use on the connection. | ||
| 1160 | */ | ||
| 1161 | #define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) | ||
| 1162 | #define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) | ||
| 1163 | #define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) | ||
| 1164 | |||
| 1165 | /* Recalculate snd_ssthresh, we want to set it to: | ||
| 1166 | * | ||
| 1167 | * Reno: | ||
| 1168 | * one half the current congestion window, but no | ||
| 1169 | * less than two segments | ||
| 1170 | * | ||
| 1171 | * BIC: | ||
| 1172 | * behave like Reno until low_window is reached, | ||
| 1173 | * then increase congestion window slowly | ||
| 1174 | */ | ||
| 1175 | static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) | ||
| 1176 | { | ||
| 1177 | if (tcp_is_bic(tp)) { | ||
| 1178 | if (sysctl_tcp_bic_fast_convergence && | ||
| 1179 | tp->snd_cwnd < tp->bictcp.last_max_cwnd) | ||
| 1180 | tp->bictcp.last_max_cwnd = (tp->snd_cwnd * | ||
| 1181 | (BICTCP_BETA_SCALE | ||
| 1182 | + sysctl_tcp_bic_beta)) | ||
| 1183 | / (2 * BICTCP_BETA_SCALE); | ||
| 1184 | else | ||
| 1185 | tp->bictcp.last_max_cwnd = tp->snd_cwnd; | ||
| 1186 | |||
| 1187 | if (tp->snd_cwnd > sysctl_tcp_bic_low_window) | ||
| 1188 | return max((tp->snd_cwnd * sysctl_tcp_bic_beta) | ||
| 1189 | / BICTCP_BETA_SCALE, 2U); | ||
| 1190 | } | ||
| 1191 | |||
| 1192 | return max(tp->snd_cwnd >> 1U, 2U); | ||
| 1193 | } | ||
| 1194 | |||
| 1195 | /* Stop taking Vegas samples for now. */ | ||
| 1196 | #define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) | ||
| 1197 | |||
| 1198 | static inline void tcp_vegas_enable(struct tcp_sock *tp) | ||
| 1199 | { | ||
| 1200 | /* There are several situations when we must "re-start" Vegas: | ||
| 1201 | * | ||
| 1202 | * o when a connection is established | ||
| 1203 | * o after an RTO | ||
| 1204 | * o after fast recovery | ||
| 1205 | * o when we send a packet and there is no outstanding | ||
| 1206 | * unacknowledged data (restarting an idle connection) | ||
| 1207 | * | ||
| 1208 | * In these circumstances we cannot do a Vegas calculation at the | ||
| 1209 | * end of the first RTT, because any calculation we do is using | ||
| 1210 | * stale info -- both the saved cwnd and congestion feedback are | ||
| 1211 | * stale. | ||
| 1212 | * | ||
| 1213 | * Instead we must wait until the completion of an RTT during | ||
| 1214 | * which we actually receive ACKs. | ||
| 1215 | */ | ||
| 1216 | |||
| 1217 | /* Begin taking Vegas samples next time we send something. */ | ||
| 1218 | tp->vegas.doing_vegas_now = 1; | ||
| 1219 | |||
| 1220 | /* Set the beginning of the next send window. */ | ||
| 1221 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
| 1222 | |||
| 1223 | tp->vegas.cntRTT = 0; | ||
| 1224 | tp->vegas.minRTT = 0x7fffffff; | ||
| 1225 | } | ||
| 1226 | |||
| 1227 | /* Should we be taking Vegas samples right now? */ | ||
| 1228 | #define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) | ||
| 1229 | |||
| 1230 | extern void tcp_ca_init(struct tcp_sock *tp); | ||
| 1231 | |||
| 1232 | static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) | ||
| 1233 | { | ||
| 1234 | if (tcp_is_vegas(tp)) { | ||
| 1235 | if (ca_state == TCP_CA_Open) | ||
| 1236 | tcp_vegas_enable(tp); | ||
| 1237 | else | ||
| 1238 | tcp_vegas_disable(tp); | ||
| 1239 | } | ||
| 1240 | tp->ca_state = ca_state; | ||
| 1241 | } | ||
| 1242 | |||
| 1243 | /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. | 1204 | /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. |
| 1244 | * The exception is rate halving phase, when cwnd is decreasing towards | 1205 | * The exception is rate halving phase, when cwnd is decreasing towards |
| 1245 | * ssthresh. | 1206 | * ssthresh. |
| @@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) | |||
| 1288 | static inline void __tcp_enter_cwr(struct tcp_sock *tp) | 1249 | static inline void __tcp_enter_cwr(struct tcp_sock *tp) |
| 1289 | { | 1250 | { |
| 1290 | tp->undo_marker = 0; | 1251 | tp->undo_marker = 0; |
| 1291 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1252 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1292 | tp->snd_cwnd = min(tp->snd_cwnd, | 1253 | tp->snd_cwnd = min(tp->snd_cwnd, |
| 1293 | tcp_packets_in_flight(tp) + 1U); | 1254 | tcp_packets_in_flight(tp) + 1U); |
| 1294 | tp->snd_cwnd_cnt = 0; | 1255 | tp->snd_cwnd_cnt = 0; |
| @@ -1876,52 +1837,4 @@ struct tcp_iter_state { | |||
| 1876 | extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); | 1837 | extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); |
| 1877 | extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); | 1838 | extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); |
| 1878 | 1839 | ||
| 1879 | /* TCP Westwood functions and constants */ | ||
| 1880 | |||
| 1881 | #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ | ||
| 1882 | #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ | ||
| 1883 | |||
| 1884 | static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) | ||
| 1885 | { | ||
| 1886 | if (tcp_is_westwood(tp)) | ||
| 1887 | tp->westwood.rtt = rtt_seq; | ||
| 1888 | } | ||
| 1889 | |||
| 1890 | static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) | ||
| 1891 | { | ||
| 1892 | return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / | ||
| 1893 | (__u32) (tp->mss_cache_std), | ||
| 1894 | 2U); | ||
| 1895 | } | ||
| 1896 | |||
| 1897 | static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) | ||
| 1898 | { | ||
| 1899 | return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; | ||
| 1900 | } | ||
| 1901 | |||
| 1902 | static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) | ||
| 1903 | { | ||
| 1904 | __u32 ssthresh = 0; | ||
| 1905 | |||
| 1906 | if (tcp_is_westwood(tp)) { | ||
| 1907 | ssthresh = __tcp_westwood_bw_rttmin(tp); | ||
| 1908 | if (ssthresh) | ||
| 1909 | tp->snd_ssthresh = ssthresh; | ||
| 1910 | } | ||
| 1911 | |||
| 1912 | return (ssthresh != 0); | ||
| 1913 | } | ||
| 1914 | |||
| 1915 | static inline int tcp_westwood_cwnd(struct tcp_sock *tp) | ||
| 1916 | { | ||
| 1917 | __u32 cwnd = 0; | ||
| 1918 | |||
| 1919 | if (tcp_is_westwood(tp)) { | ||
| 1920 | cwnd = __tcp_westwood_bw_rttmin(tp); | ||
| 1921 | if (cwnd) | ||
| 1922 | tp->snd_cwnd = cwnd; | ||
| 1923 | } | ||
| 1924 | |||
| 1925 | return (cwnd != 0); | ||
| 1926 | } | ||
| 1927 | #endif /* _TCP_H */ | 1840 | #endif /* _TCP_H */ |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1add..89c0b4cb470e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
| @@ -5,7 +5,8 @@ | |||
| 5 | obj-y := utils.o route.o inetpeer.o protocol.o \ | 5 | obj-y := utils.o route.o inetpeer.o protocol.o \ |
| 6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ | 6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ |
| 7 | ip_output.o ip_sockglue.o \ | 7 | ip_output.o ip_sockglue.o \ |
| 8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ | 8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ |
| 9 | tcp_minisocks.o tcp_cong.o \ | ||
| 9 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ | 10 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ |
| 10 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o | 11 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o |
| 11 | 12 | ||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0b..e32894532416 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
| @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, | |||
| 118 | return 1; | 118 | return 1; |
| 119 | } | 119 | } |
| 120 | 120 | ||
| 121 | static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, | ||
| 122 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 123 | { | ||
| 124 | char val[TCP_CA_NAME_MAX]; | ||
| 125 | ctl_table tbl = { | ||
| 126 | .data = val, | ||
| 127 | .maxlen = TCP_CA_NAME_MAX, | ||
| 128 | }; | ||
| 129 | int ret; | ||
| 130 | |||
| 131 | tcp_get_default_congestion_control(val); | ||
| 132 | |||
| 133 | ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); | ||
| 134 | if (write && ret == 0) | ||
| 135 | ret = tcp_set_default_congestion_control(val); | ||
| 136 | return ret; | ||
| 137 | } | ||
| 138 | |||
| 139 | int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, | ||
| 140 | void __user *oldval, size_t __user *oldlenp, | ||
| 141 | void __user *newval, size_t newlen, | ||
| 142 | void **context) | ||
| 143 | { | ||
| 144 | char val[TCP_CA_NAME_MAX]; | ||
| 145 | ctl_table tbl = { | ||
| 146 | .data = val, | ||
| 147 | .maxlen = TCP_CA_NAME_MAX, | ||
| 148 | }; | ||
| 149 | int ret; | ||
| 150 | |||
| 151 | tcp_get_default_congestion_control(val); | ||
| 152 | ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, | ||
| 153 | context); | ||
| 154 | if (ret == 0 && newval && newlen) | ||
| 155 | ret = tcp_set_default_congestion_control(val); | ||
| 156 | return ret; | ||
| 157 | } | ||
| 158 | |||
| 159 | |||
| 121 | ctl_table ipv4_table[] = { | 160 | ctl_table ipv4_table[] = { |
| 122 | { | 161 | { |
| 123 | .ctl_name = NET_IPV4_TCP_TIMESTAMPS, | 162 | .ctl_name = NET_IPV4_TCP_TIMESTAMPS, |
| @@ -612,70 +651,6 @@ ctl_table ipv4_table[] = { | |||
| 612 | .proc_handler = &proc_dointvec, | 651 | .proc_handler = &proc_dointvec, |
| 613 | }, | 652 | }, |
| 614 | { | 653 | { |
| 615 | .ctl_name = NET_TCP_WESTWOOD, | ||
| 616 | .procname = "tcp_westwood", | ||
| 617 | .data = &sysctl_tcp_westwood, | ||
| 618 | .maxlen = sizeof(int), | ||
| 619 | .mode = 0644, | ||
| 620 | .proc_handler = &proc_dointvec, | ||
| 621 | }, | ||
| 622 | { | ||
| 623 | .ctl_name = NET_TCP_VEGAS, | ||
| 624 | .procname = "tcp_vegas_cong_avoid", | ||
| 625 | .data = &sysctl_tcp_vegas_cong_avoid, | ||
| 626 | .maxlen = sizeof(int), | ||
| 627 | .mode = 0644, | ||
| 628 | .proc_handler = &proc_dointvec, | ||
| 629 | }, | ||
| 630 | { | ||
| 631 | .ctl_name = NET_TCP_VEGAS_ALPHA, | ||
| 632 | .procname = "tcp_vegas_alpha", | ||
| 633 | .data = &sysctl_tcp_vegas_alpha, | ||
| 634 | .maxlen = sizeof(int), | ||
| 635 | .mode = 0644, | ||
| 636 | .proc_handler = &proc_dointvec, | ||
| 637 | }, | ||
| 638 | { | ||
| 639 | .ctl_name = NET_TCP_VEGAS_BETA, | ||
| 640 | .procname = "tcp_vegas_beta", | ||
| 641 | .data = &sysctl_tcp_vegas_beta, | ||
| 642 | .maxlen = sizeof(int), | ||
| 643 | .mode = 0644, | ||
| 644 | .proc_handler = &proc_dointvec, | ||
| 645 | }, | ||
| 646 | { | ||
| 647 | .ctl_name = NET_TCP_VEGAS_GAMMA, | ||
| 648 | .procname = "tcp_vegas_gamma", | ||
| 649 | .data = &sysctl_tcp_vegas_gamma, | ||
| 650 | .maxlen = sizeof(int), | ||
| 651 | .mode = 0644, | ||
| 652 | .proc_handler = &proc_dointvec, | ||
| 653 | }, | ||
| 654 | { | ||
| 655 | .ctl_name = NET_TCP_BIC, | ||
| 656 | .procname = "tcp_bic", | ||
| 657 | .data = &sysctl_tcp_bic, | ||
| 658 | .maxlen = sizeof(int), | ||
| 659 | .mode = 0644, | ||
| 660 | .proc_handler = &proc_dointvec, | ||
| 661 | }, | ||
| 662 | { | ||
| 663 | .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, | ||
| 664 | .procname = "tcp_bic_fast_convergence", | ||
| 665 | .data = &sysctl_tcp_bic_fast_convergence, | ||
| 666 | .maxlen = sizeof(int), | ||
| 667 | .mode = 0644, | ||
| 668 | .proc_handler = &proc_dointvec, | ||
| 669 | }, | ||
| 670 | { | ||
| 671 | .ctl_name = NET_TCP_BIC_LOW_WINDOW, | ||
| 672 | .procname = "tcp_bic_low_window", | ||
| 673 | .data = &sysctl_tcp_bic_low_window, | ||
| 674 | .maxlen = sizeof(int), | ||
| 675 | .mode = 0644, | ||
| 676 | .proc_handler = &proc_dointvec, | ||
| 677 | }, | ||
| 678 | { | ||
| 679 | .ctl_name = NET_TCP_MODERATE_RCVBUF, | 654 | .ctl_name = NET_TCP_MODERATE_RCVBUF, |
| 680 | .procname = "tcp_moderate_rcvbuf", | 655 | .procname = "tcp_moderate_rcvbuf", |
| 681 | .data = &sysctl_tcp_moderate_rcvbuf, | 656 | .data = &sysctl_tcp_moderate_rcvbuf, |
| @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { | |||
| 692 | .proc_handler = &proc_dointvec, | 667 | .proc_handler = &proc_dointvec, |
| 693 | }, | 668 | }, |
| 694 | { | 669 | { |
| 695 | .ctl_name = NET_TCP_BIC_BETA, | 670 | .ctl_name = NET_TCP_CONG_CONTROL, |
| 696 | .procname = "tcp_bic_beta", | 671 | .procname = "tcp_congestion_control", |
| 697 | .data = &sysctl_tcp_bic_beta, | ||
| 698 | .maxlen = sizeof(int), | ||
| 699 | .mode = 0644, | 672 | .mode = 0644, |
| 700 | .proc_handler = &proc_dointvec, | 673 | .maxlen = TCP_CA_NAME_MAX, |
| 674 | .proc_handler = &proc_tcp_congestion_control, | ||
| 675 | .strategy = &sysctl_tcp_congestion_control, | ||
| 701 | }, | 676 | }, |
| 677 | |||
| 702 | { .ctl_name = 0 } | 678 | { .ctl_name = 0 } |
| 703 | }; | 679 | }; |
| 704 | 680 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd36..f3dbc8dc1263 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
| @@ -2333,6 +2333,8 @@ void __init tcp_init(void) | |||
| 2333 | printk(KERN_INFO "TCP: Hash tables configured " | 2333 | printk(KERN_INFO "TCP: Hash tables configured " |
| 2334 | "(established %d bind %d)\n", | 2334 | "(established %d bind %d)\n", |
| 2335 | tcp_ehash_size << 1, tcp_bhash_size); | 2335 | tcp_ehash_size << 1, tcp_bhash_size); |
| 2336 | |||
| 2337 | tcp_register_congestion_control(&tcp_reno); | ||
| 2336 | } | 2338 | } |
| 2337 | 2339 | ||
| 2338 | EXPORT_SYMBOL(tcp_accept); | 2340 | EXPORT_SYMBOL(tcp_accept); |
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 000000000000..665394a63ae4 --- /dev/null +++ b/net/ipv4/tcp_cong.c | |||
| @@ -0,0 +1,195 @@ | |||
| 1 | /* | ||
| 2 | * Pluggable TCP congestion control support and newReno | ||
| 3 | * congestion control. | ||
| 4 | * Based on ideas from I/O scheduler support and Web100. | ||
| 5 | * | ||
| 6 | * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/config.h> | ||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/mm.h> | ||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/list.h> | ||
| 14 | #include <net/tcp.h> | ||
| 15 | |||
| 16 | static DEFINE_SPINLOCK(tcp_cong_list_lock); | ||
| 17 | static LIST_HEAD(tcp_cong_list); | ||
| 18 | |||
| 19 | /* Simple linear search, don't expect many entries! */ | ||
| 20 | static struct tcp_congestion_ops *tcp_ca_find(const char *name) | ||
| 21 | { | ||
| 22 | struct tcp_congestion_ops *e; | ||
| 23 | |||
| 24 | list_for_each_entry(e, &tcp_cong_list, list) { | ||
| 25 | if (strcmp(e->name, name) == 0) | ||
| 26 | return e; | ||
| 27 | } | ||
| 28 | |||
| 29 | return NULL; | ||
| 30 | } | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Attach new congestion control algorithm to the list | ||
| 34 | * of available options. | ||
| 35 | */ | ||
| 36 | int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | ||
| 37 | { | ||
| 38 | int ret = 0; | ||
| 39 | |||
| 40 | /* all algorithms must implement the ssthresh, cong_avoid and min_cwnd ops */ | ||
| 41 | if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { | ||
| 42 | printk(KERN_ERR "TCP %s does not implement required ops\n", | ||
| 43 | ca->name); | ||
| 44 | return -EINVAL; | ||
| 45 | } | ||
| 46 | |||
| 47 | spin_lock(&tcp_cong_list_lock); | ||
| 48 | if (tcp_ca_find(ca->name)) { | ||
| 49 | printk(KERN_NOTICE "TCP %s already registered\n", ca->name); | ||
| 50 | ret = -EEXIST; | ||
| 51 | } else { | ||
| 52 | list_add_rcu(&ca->list, &tcp_cong_list); | ||
| 53 | printk(KERN_INFO "TCP %s registered\n", ca->name); | ||
| 54 | } | ||
| 55 | spin_unlock(&tcp_cong_list_lock); | ||
| 56 | |||
| 57 | return ret; | ||
| 58 | } | ||
| 59 | EXPORT_SYMBOL_GPL(tcp_register_congestion_control); | ||
| 60 | |||
| 61 | /* | ||
| 62 | * Remove congestion control algorithm, called from | ||
| 63 | * the module's remove function. Module ref counts are used | ||
| 64 | * to ensure that this can't be done till all sockets using | ||
| 65 | * that method are closed. | ||
| 66 | */ | ||
| 67 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | ||
| 68 | { | ||
| 69 | spin_lock(&tcp_cong_list_lock); | ||
| 70 | list_del_rcu(&ca->list); | ||
| 71 | spin_unlock(&tcp_cong_list_lock); | ||
| 72 | } | ||
| 73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | ||
| 74 | |||
| 75 | /* Assign choice of congestion control. */ | ||
| 76 | void tcp_init_congestion_control(struct tcp_sock *tp) | ||
| 77 | { | ||
| 78 | struct tcp_congestion_ops *ca; | ||
| 79 | |||
| 80 | rcu_read_lock(); | ||
| 81 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | ||
| 82 | if (try_module_get(ca->owner)) { | ||
| 83 | tp->ca_ops = ca; | ||
| 84 | break; | ||
| 85 | } | ||
| 86 | |||
| 87 | } | ||
| 88 | rcu_read_unlock(); | ||
| 89 | |||
| 90 | if (tp->ca_ops->init) | ||
| 91 | tp->ca_ops->init(tp); | ||
| 92 | } | ||
| 93 | |||
| 94 | /* Manage refcounts on socket close. */ | ||
| 95 | void tcp_cleanup_congestion_control(struct tcp_sock *tp) | ||
| 96 | { | ||
| 97 | if (tp->ca_ops->release) | ||
| 98 | tp->ca_ops->release(tp); | ||
| 99 | module_put(tp->ca_ops->owner); | ||
| 100 | } | ||
| 101 | |||
| 102 | /* Used by sysctl to change default congestion control */ | ||
| 103 | int tcp_set_default_congestion_control(const char *name) | ||
| 104 | { | ||
| 105 | struct tcp_congestion_ops *ca; | ||
| 106 | int ret = -ENOENT; | ||
| 107 | |||
| 108 | spin_lock(&tcp_cong_list_lock); | ||
| 109 | ca = tcp_ca_find(name); | ||
| 110 | #ifdef CONFIG_KMOD | ||
| 111 | if (!ca) { | ||
| 112 | spin_unlock(&tcp_cong_list_lock); | ||
| 113 | |||
| 114 | request_module("tcp_%s", name); | ||
| 115 | spin_lock(&tcp_cong_list_lock); | ||
| 116 | ca = tcp_ca_find(name); | ||
| 117 | } | ||
| 118 | #endif | ||
| 119 | |||
| 120 | if (ca) { | ||
| 121 | list_move(&ca->list, &tcp_cong_list); | ||
| 122 | ret = 0; | ||
| 123 | } | ||
| 124 | spin_unlock(&tcp_cong_list_lock); | ||
| 125 | |||
| 126 | return ret; | ||
| 127 | } | ||
| 128 | |||
| 129 | /* Get current default congestion control: copies the name of the first entry on tcp_cong_list into name. Up to TCP_CA_NAME_MAX bytes are written, so the caller's buffer presumably has to be at least that large -- confirm against callers. */ | ||
| 130 | void tcp_get_default_congestion_control(char *name) | ||
| 131 | { | ||
| 132 | struct tcp_congestion_ops *ca; | ||
| 133 | /* We will always have reno... so an empty list is treated as a fatal bug. */ | ||
| 134 | BUG_ON(list_empty(&tcp_cong_list)); | ||
| 135 | |||
| 136 | rcu_read_lock(); | ||
| 137 | ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); /* first entry == current default */ | ||
| 138 | strncpy(name, ca->name, TCP_CA_NAME_MAX); /* NOTE(review): strncpy() does not NUL-terminate if ca->name has no NUL within TCP_CA_NAME_MAX bytes */ | ||
| 139 | rcu_read_unlock(); | ||
| 140 | } | ||
| 141 | |||
| 142 | /* | ||
| 143 | * TCP Reno congestion control | ||
| 144 | * This is a special case: it is also used as the fallback algorithm. | ||
| 145 | */ | ||
| 146 | /* This is Jacobson's slow start and congestion avoidance. | ||
| 147 | * SIGCOMM '88, p. 328. Grows tp->snd_cwnd by at most one per call and never past tp->snd_cwnd_clamp; the ack, rtt and flag arguments are not used here. | ||
| 148 | */ | ||
| 149 | void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | ||
| 150 | int flag) | ||
| 151 | { | ||
| 152 | if (in_flight < tp->snd_cwnd) /* current window is not fully used: leave cwnd alone */ | ||
| 153 | return; | ||
| 154 | |||
| 155 | if (tp->snd_cwnd <= tp->snd_ssthresh) { /* at or below ssthresh: grow by one on every call */ | ||
| 156 | /* In "safe" area, increase. */ | ||
| 157 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 158 | tp->snd_cwnd++; | ||
| 159 | } else { | ||
| 160 | /* In dangerous area, increase slowly. | ||
| 161 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
| 162 | */ | ||
| 163 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { /* snd_cwnd_cnt counts calls since the last increase */ | ||
| 164 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 165 | tp->snd_cwnd++; | ||
| 166 | tp->snd_cwnd_cnt = 0; /* counter restarts even when the clamp blocked the increase */ | ||
| 167 | } else | ||
| 168 | tp->snd_cwnd_cnt++; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | ||
| 172 | |||
| 173 | /* Slow start threshold is half the congestion window (min 2). Returns the new threshold; tp itself is not modified. */ | ||
| 174 | u32 tcp_reno_ssthresh(struct tcp_sock *tp) | ||
| 175 | { | ||
| 176 | return max(tp->snd_cwnd >> 1U, 2U); /* snd_cwnd / 2, but never below 2 */ | ||
| 177 | } | ||
| 178 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | ||
| 179 | |||
| 180 | /* Lower bound on congestion window: half of the current slow start threshold. Read-only with respect to tp. */ | ||
| 181 | u32 tcp_reno_min_cwnd(struct tcp_sock *tp) | ||
| 182 | { | ||
| 183 | return tp->snd_ssthresh/2; /* integer division, rounds down */ | ||
| 184 | } | ||
| 185 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); | ||
| 186 | |||
| 187 | struct tcp_congestion_ops tcp_reno = { /* Reno operations table; only the three hooks below are set, every other member is left zero-initialized */ | ||
| 188 | .name = "reno", | ||
| 189 | .owner = THIS_MODULE, | ||
| 190 | .ssthresh = tcp_reno_ssthresh, /* new ssthresh: max(snd_cwnd / 2, 2) */ | ||
| 191 | .cong_avoid = tcp_reno_cong_avoid, /* window growth */ | ||
| 192 | .min_cwnd = tcp_reno_min_cwnd, /* lower bound: snd_ssthresh / 2 */ | ||
| 193 | }; | ||
| 194 | |||
| 195 | EXPORT_SYMBOL_GPL(tcp_reno); | ||
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc07921..867acc0f79d8 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
| @@ -42,7 +42,6 @@ struct tcpdiag_entry | |||
| 42 | 42 | ||
| 43 | static struct sock *tcpnl; | 43 | static struct sock *tcpnl; |
| 44 | 44 | ||
| 45 | |||
| 46 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ | 45 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ |
| 47 | ({ int rtalen = RTA_LENGTH(attrlen); \ | 46 | ({ int rtalen = RTA_LENGTH(attrlen); \ |
| 48 | struct rtattr *rta; \ | 47 | struct rtattr *rta; \ |
| @@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
| 61 | struct nlmsghdr *nlh; | 60 | struct nlmsghdr *nlh; |
| 62 | struct tcp_info *info = NULL; | 61 | struct tcp_info *info = NULL; |
| 63 | struct tcpdiag_meminfo *minfo = NULL; | 62 | struct tcpdiag_meminfo *minfo = NULL; |
| 64 | struct tcpvegas_info *vinfo = NULL; | ||
| 65 | unsigned char *b = skb->tail; | 63 | unsigned char *b = skb->tail; |
| 66 | 64 | ||
| 67 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | 65 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); |
| @@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
| 73 | if (ext & (1<<(TCPDIAG_INFO-1))) | 71 | if (ext & (1<<(TCPDIAG_INFO-1))) |
| 74 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); | 72 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); |
| 75 | 73 | ||
| 76 | if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) | ||
| 77 | && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) | ||
| 78 | vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); | ||
| 79 | } | 74 | } |
| 80 | r->tcpdiag_family = sk->sk_family; | 75 | r->tcpdiag_family = sk->sk_family; |
| 81 | r->tcpdiag_state = sk->sk_state; | 76 | r->tcpdiag_state = sk->sk_state; |
| @@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
| 166 | if (info) | 161 | if (info) |
| 167 | tcp_get_info(sk, info); | 162 | tcp_get_info(sk, info); |
| 168 | 163 | ||
| 169 | if (vinfo) { | 164 | if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) |
| 170 | if (tcp_is_vegas(tp)) { | 165 | tp->ca_ops->get_info(tp, ext, skb); |
| 171 | vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; | ||
| 172 | vinfo->tcpv_rttcnt = tp->vegas.cntRTT; | ||
| 173 | vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); | ||
| 174 | vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); | ||
| 175 | } else { | ||
| 176 | vinfo->tcpv_enabled = 0; | ||
| 177 | vinfo->tcpv_rttcnt = 0; | ||
| 178 | vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); | ||
| 179 | vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); | ||
| 180 | } | ||
| 181 | } | ||
| 182 | 166 | ||
| 183 | nlh->nlmsg_len = skb->tail - b; | 167 | nlh->nlmsg_len = skb->tail - b; |
| 184 | return skb->len; | 168 | return skb->len; |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a3..7bbbbc33eb4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -61,7 +61,6 @@ | |||
| 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission | 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission |
| 62 | * engine. Lots of bugs are found. | 62 | * engine. Lots of bugs are found. |
| 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
| 64 | * Angelo Dell'Aera: TCP Westwood+ support | ||
| 65 | */ | 64 | */ |
| 66 | 65 | ||
| 67 | #include <linux/config.h> | 66 | #include <linux/config.h> |
| @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; | |||
| 88 | int sysctl_tcp_max_orphans = NR_FILE; | 87 | int sysctl_tcp_max_orphans = NR_FILE; |
| 89 | int sysctl_tcp_frto; | 88 | int sysctl_tcp_frto; |
| 90 | int sysctl_tcp_nometrics_save; | 89 | int sysctl_tcp_nometrics_save; |
| 91 | int sysctl_tcp_westwood; | ||
| 92 | int sysctl_tcp_vegas_cong_avoid; | ||
| 93 | 90 | ||
| 94 | int sysctl_tcp_moderate_rcvbuf = 1; | 91 | int sysctl_tcp_moderate_rcvbuf = 1; |
| 95 | 92 | ||
| 96 | /* Default values of the Vegas variables, in fixed-point representation | ||
| 97 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
| 98 | */ | ||
| 99 | #define V_PARAM_SHIFT 1 | ||
| 100 | int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; | ||
| 101 | int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; | ||
| 102 | int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; | ||
| 103 | int sysctl_tcp_bic = 1; | ||
| 104 | int sysctl_tcp_bic_fast_convergence = 1; | ||
| 105 | int sysctl_tcp_bic_low_window = 14; | ||
| 106 | int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
| 107 | |||
| 108 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 93 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
| 109 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 94 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
| 110 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ | 95 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ |
| @@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
| 333 | tp->snd_cwnd_stamp = tcp_time_stamp; | 318 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 334 | } | 319 | } |
| 335 | 320 | ||
| 336 | static void init_bictcp(struct tcp_sock *tp) | ||
| 337 | { | ||
| 338 | tp->bictcp.cnt = 0; | ||
| 339 | |||
| 340 | tp->bictcp.last_max_cwnd = 0; | ||
| 341 | tp->bictcp.last_cwnd = 0; | ||
| 342 | tp->bictcp.last_stamp = 0; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
| 346 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
| 347 | { | 323 | { |
| @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
| 558 | tcp_grow_window(sk, tp, skb); | 534 | tcp_grow_window(sk, tp, skb); |
| 559 | } | 535 | } |
| 560 | 536 | ||
| 561 | /* When starting a new connection, pin down the current choice of | ||
| 562 | * congestion algorithm. | ||
| 563 | */ | ||
| 564 | void tcp_ca_init(struct tcp_sock *tp) | ||
| 565 | { | ||
| 566 | if (sysctl_tcp_westwood) | ||
| 567 | tp->adv_cong = TCP_WESTWOOD; | ||
| 568 | else if (sysctl_tcp_bic) | ||
| 569 | tp->adv_cong = TCP_BIC; | ||
| 570 | else if (sysctl_tcp_vegas_cong_avoid) { | ||
| 571 | tp->adv_cong = TCP_VEGAS; | ||
| 572 | tp->vegas.baseRTT = 0x7fffffff; | ||
| 573 | tcp_vegas_enable(tp); | ||
| 574 | } | ||
| 575 | } | ||
| 576 | |||
| 577 | /* Do RTT sampling needed for Vegas. | ||
| 578 | * Basically we: | ||
| 579 | * o min-filter RTT samples from within an RTT to get the current | ||
| 580 | * propagation delay + queuing delay (we are min-filtering to try to | ||
| 581 | * avoid the effects of delayed ACKs) | ||
| 582 | * o min-filter RTT samples from a much longer window (forever for now) | ||
| 583 | * to find the propagation delay (baseRTT) | ||
| 584 | */ | ||
| 585 | static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | ||
| 586 | { | ||
| 587 | __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ | ||
| 588 | |||
| 589 | /* Filter to find propagation delay: */ | ||
| 590 | if (vrtt < tp->vegas.baseRTT) | ||
| 591 | tp->vegas.baseRTT = vrtt; | ||
| 592 | |||
| 593 | /* Find the min RTT during the last RTT to find | ||
| 594 | * the current prop. delay + queuing delay: | ||
| 595 | */ | ||
| 596 | tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); | ||
| 597 | tp->vegas.cntRTT++; | ||
| 598 | } | ||
| 599 | |||
| 600 | /* Called to compute a smoothed rtt estimate. The data fed to this | 537 | /* Called to compute a smoothed rtt estimate. The data fed to this |
| 601 | * routine either comes from timestamps, or from segments that were | 538 | * routine either comes from timestamps, or from segments that were |
| 602 | * known _not_ to have been retransmitted [see Karn/Partridge | 539 | * known _not_ to have been retransmitted [see Karn/Partridge |
| @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | |||
| 606 | * To save cycles in the RFC 1323 implementation it was better to break | 543 | * To save cycles in the RFC 1323 implementation it was better to break |
| 607 | * it up into three procedures. -- erics | 544 | * it up into three procedures. -- erics |
| 608 | */ | 545 | */ |
| 609 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | 546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) |
| 610 | { | 547 | { |
| 611 | long m = mrtt; /* RTT */ | 548 | long m = mrtt; /* RTT */ |
| 612 | 549 | ||
| 613 | if (tcp_vegas_enabled(tp)) | ||
| 614 | vegas_rtt_calc(tp, mrtt); | ||
| 615 | |||
| 616 | /* The following amusing code comes from Jacobson's | 550 | /* The following amusing code comes from Jacobson's |
| 617 | * article in SIGCOMM '88. Note that rtt and mdev | 551 | * article in SIGCOMM '88. Note that rtt and mdev |
| 618 | * are scaled versions of rtt and mean deviation. | 552 | * are scaled versions of rtt and mean deviation. |
| @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | |||
| 670 | tp->rtt_seq = tp->snd_nxt; | 604 | tp->rtt_seq = tp->snd_nxt; |
| 671 | } | 605 | } |
| 672 | 606 | ||
| 673 | tcp_westwood_update_rtt(tp, tp->srtt >> 3); | 607 | if (tp->ca_ops->rtt_sample) |
| 608 | tp->ca_ops->rtt_sample(tp, *usrtt); | ||
| 674 | } | 609 | } |
| 675 | 610 | ||
| 676 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
| @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) | |||
| 1185 | tp->snd_una == tp->high_seq || | 1120 | tp->snd_una == tp->high_seq || |
| 1186 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
| 1187 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1188 | if (!tcp_westwood_ssthresh(tp)) | 1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1189 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1124 | tcp_ca_event(tp, CA_EVENT_FRTO); |
| 1190 | } | 1125 | } |
| 1191 | 1126 | ||
| 1192 | /* Have to clear retransmission markers here to keep the bookkeeping | 1127 | /* Have to clear retransmission markers here to keep the bookkeeping |
| @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
| 1252 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1187 | tcp_set_ca_state(tp, TCP_CA_Loss); |
| 1253 | tp->high_seq = tp->frto_highmark; | 1188 | tp->high_seq = tp->frto_highmark; |
| 1254 | TCP_ECN_queue_cwr(tp); | 1189 | TCP_ECN_queue_cwr(tp); |
| 1255 | |||
| 1256 | init_bictcp(tp); | ||
| 1257 | } | 1190 | } |
| 1258 | 1191 | ||
| 1259 | void tcp_clear_retrans(struct tcp_sock *tp) | 1192 | void tcp_clear_retrans(struct tcp_sock *tp) |
| @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
| 1283 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
| 1284 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
| 1285 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1286 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | ||
| 1287 | } | 1221 | } |
| 1288 | tp->snd_cwnd = 1; | 1222 | tp->snd_cwnd = 1; |
| 1289 | tp->snd_cwnd_cnt = 0; | 1223 | tp->snd_cwnd_cnt = 0; |
| @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
| 1596 | } | 1530 | } |
| 1597 | 1531 | ||
| 1598 | /* Decrease cwnd each second ack. */ | 1532 | /* Decrease cwnd each second ack. */ |
| 1599 | |||
| 1600 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1533 | static void tcp_cwnd_down(struct tcp_sock *tp) |
| 1601 | { | 1534 | { |
| 1602 | int decr = tp->snd_cwnd_cnt + 1; | 1535 | int decr = tp->snd_cwnd_cnt + 1; |
| 1603 | __u32 limit; | ||
| 1604 | |||
| 1605 | /* | ||
| 1606 | * TCP Westwood | ||
| 1607 | * Here limit is evaluated as BWestimation*RTTmin (for obtaining it | ||
| 1608 | * in packets we use mss_cache). If sysctl_tcp_westwood is off | ||
| 1609 | * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is | ||
| 1610 | * still used as usual. It prevents other strange cases in which | ||
| 1611 | * BWE*RTTmin could assume value 0. It should not happen but... | ||
| 1612 | */ | ||
| 1613 | |||
| 1614 | if (!(limit = tcp_westwood_bw_rttmin(tp))) | ||
| 1615 | limit = tp->snd_ssthresh/2; | ||
| 1616 | 1536 | ||
| 1617 | tp->snd_cwnd_cnt = decr&1; | 1537 | tp->snd_cwnd_cnt = decr&1; |
| 1618 | decr >>= 1; | 1538 | decr >>= 1; |
| 1619 | 1539 | ||
| 1620 | if (decr && tp->snd_cwnd > limit) | 1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) |
| 1621 | tp->snd_cwnd -= decr; | 1541 | tp->snd_cwnd -= decr; |
| 1622 | 1542 | ||
| 1623 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
| @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
| 1654 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) |
| 1655 | { | 1575 | { |
| 1656 | if (tp->prior_ssthresh) { | 1576 | if (tp->prior_ssthresh) { |
| 1657 | if (tcp_is_bic(tp)) | 1577 | if (tp->ca_ops->undo_cwnd) |
| 1658 | tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); | 1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); |
| 1659 | else | 1579 | else |
| 1660 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
| 1661 | 1581 | ||
| @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
| 1767 | 1687 | ||
| 1768 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) |
| 1769 | { | 1689 | { |
| 1770 | if (tcp_westwood_cwnd(tp)) | 1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
| 1771 | tp->snd_ssthresh = tp->snd_cwnd; | ||
| 1772 | else | ||
| 1773 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
| 1774 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1691 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | ||
| 1775 | } | 1693 | } |
| 1776 | 1694 | ||
| 1777 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
| @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1946 | if (tp->ca_state < TCP_CA_CWR) { | 1864 | if (tp->ca_state < TCP_CA_CWR) { |
| 1947 | if (!(flag&FLAG_ECE)) | 1865 | if (!(flag&FLAG_ECE)) |
| 1948 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1949 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1950 | TCP_ECN_queue_cwr(tp); | 1868 | TCP_ECN_queue_cwr(tp); |
| 1951 | } | 1869 | } |
| 1952 | 1870 | ||
| @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1963 | /* Read draft-ietf-tcplw-high-performance before mucking | 1881 | /* Read draft-ietf-tcplw-high-performance before mucking |
| 1964 | * with this code. (Superceeds RFC1323) | 1882 | * with this code. (Superceeds RFC1323) |
| 1965 | */ | 1883 | */ |
| 1966 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | 1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) |
| 1967 | { | 1885 | { |
| 1968 | __u32 seq_rtt; | 1886 | __u32 seq_rtt; |
| 1969 | 1887 | ||
| @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | |||
| 1983 | * in window is lost... Voila. --ANK (010210) | 1901 | * in window is lost... Voila. --ANK (010210) |
| 1984 | */ | 1902 | */ |
| 1985 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
| 1986 | tcp_rtt_estimator(tp, seq_rtt); | 1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
| 1987 | tcp_set_rto(tp); | 1905 | tcp_set_rto(tp); |
| 1988 | tp->backoff = 0; | 1906 | tp->backoff = 0; |
| 1989 | tcp_bound_rto(tp); | 1907 | tcp_bound_rto(tp); |
| 1990 | } | 1908 | } |
| 1991 | 1909 | ||
| 1992 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | 1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) |
| 1993 | { | 1911 | { |
| 1994 | /* We don't have a timestamp. Can only use | 1912 | /* We don't have a timestamp. Can only use |
| 1995 | * packets that are not retransmitted to determine | 1913 | * packets that are not retransmitted to determine |
| @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | |||
| 2003 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1921 | if (flag & FLAG_RETRANS_DATA_ACKED) |
| 2004 | return; | 1922 | return; |
| 2005 | 1923 | ||
| 2006 | tcp_rtt_estimator(tp, seq_rtt); | 1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
| 2007 | tcp_set_rto(tp); | 1925 | tcp_set_rto(tp); |
| 2008 | tp->backoff = 0; | 1926 | tp->backoff = 0; |
| 2009 | tcp_bound_rto(tp); | 1927 | tcp_bound_rto(tp); |
| 2010 | } | 1928 | } |
| 2011 | 1929 | ||
| 2012 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, |
| 2013 | int flag, s32 seq_rtt) | 1931 | int flag, s32 seq_rtt, u32 *usrtt) |
| 2014 | { | 1932 | { |
| 2015 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
| 2016 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 2017 | tcp_ack_saw_tstamp(tp, flag); | 1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); |
| 2018 | else if (seq_rtt >= 0) | 1936 | else if (seq_rtt >= 0) |
| 2019 | tcp_ack_no_tstamp(tp, seq_rtt, flag); | 1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); |
| 2020 | } | 1938 | } |
| 2021 | 1939 | ||
| 2022 | /* | 1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, |
| 2023 | * Compute congestion window to use. | 1941 | u32 in_flight, int good) |
| 2024 | * | ||
| 2025 | * This is from the implementation of BICTCP in | ||
| 2026 | * Lison-Xu, Kahaled Harfoush, and Injog Rhee. | ||
| 2027 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
| 2028 | * Networks" in InfoComm 2004 | ||
| 2029 | * Available from: | ||
| 2030 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
| 2031 | * | ||
| 2032 | * Unless BIC is enabled and congestion window is large | ||
| 2033 | * this behaves the same as the original Reno. | ||
| 2034 | */ | ||
| 2035 | static inline __u32 bictcp_cwnd(struct tcp_sock *tp) | ||
| 2036 | { | ||
| 2037 | /* orignal Reno behaviour */ | ||
| 2038 | if (!tcp_is_bic(tp)) | ||
| 2039 | return tp->snd_cwnd; | ||
| 2040 | |||
| 2041 | if (tp->bictcp.last_cwnd == tp->snd_cwnd && | ||
| 2042 | (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) | ||
| 2043 | return tp->bictcp.cnt; | ||
| 2044 | |||
| 2045 | tp->bictcp.last_cwnd = tp->snd_cwnd; | ||
| 2046 | tp->bictcp.last_stamp = tcp_time_stamp; | ||
| 2047 | |||
| 2048 | /* start off normal */ | ||
| 2049 | if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) | ||
| 2050 | tp->bictcp.cnt = tp->snd_cwnd; | ||
| 2051 | |||
| 2052 | /* binary increase */ | ||
| 2053 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { | ||
| 2054 | __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) | ||
| 2055 | / BICTCP_B; | ||
| 2056 | |||
| 2057 | if (dist > BICTCP_MAX_INCREMENT) | ||
| 2058 | /* linear increase */ | ||
| 2059 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
| 2060 | else if (dist <= 1U) | ||
| 2061 | /* binary search increase */ | ||
| 2062 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
| 2063 | / BICTCP_B; | ||
| 2064 | else | ||
| 2065 | /* binary search increase */ | ||
| 2066 | tp->bictcp.cnt = tp->snd_cwnd / dist; | ||
| 2067 | } else { | ||
| 2068 | /* slow start amd linear increase */ | ||
| 2069 | if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) | ||
| 2070 | /* slow start */ | ||
| 2071 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
| 2072 | / BICTCP_B; | ||
| 2073 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd | ||
| 2074 | + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) | ||
| 2075 | /* slow start */ | ||
| 2076 | tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) | ||
| 2077 | / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); | ||
| 2078 | else | ||
| 2079 | /* linear increase */ | ||
| 2080 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
| 2081 | } | ||
| 2082 | return tp->bictcp.cnt; | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | /* This is Jacobson's slow start and congestion avoidance. | ||
| 2086 | * SIGCOMM '88, p. 328. | ||
| 2087 | */ | ||
| 2088 | static inline void reno_cong_avoid(struct tcp_sock *tp) | ||
| 2089 | { | 1942 | { |
| 2090 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); |
| 2091 | /* In "safe" area, increase. */ | ||
| 2092 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 2093 | tp->snd_cwnd++; | ||
| 2094 | } else { | ||
| 2095 | /* In dangerous area, increase slowly. | ||
| 2096 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
| 2097 | */ | ||
| 2098 | if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { | ||
| 2099 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 2100 | tp->snd_cwnd++; | ||
| 2101 | tp->snd_cwnd_cnt=0; | ||
| 2102 | } else | ||
| 2103 | tp->snd_cwnd_cnt++; | ||
| 2104 | } | ||
| 2105 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1944 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 2106 | } | 1945 | } |
| 2107 | 1946 | ||
| 2108 | /* This is based on the congestion detection/avoidance scheme described in | ||
| 2109 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
| 2110 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
| 2111 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
| 2112 | * October 1995. Available from: | ||
| 2113 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
| 2114 | * | ||
| 2115 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
| 2116 | * The main aspects that distinguish this implementation from the | ||
| 2117 | * Arizona Vegas implementation are: | ||
| 2118 | * o We do not change the loss detection or recovery mechanisms of | ||
| 2119 | * Linux in any way. Linux already recovers from losses quite well, | ||
| 2120 | * using fine-grained timers, NewReno, and FACK. | ||
| 2121 | * o To avoid the performance penalty imposed by increasing cwnd | ||
| 2122 | * only every-other RTT during slow start, we increase during | ||
| 2123 | * every RTT during slow start, just like Reno. | ||
| 2124 | * o Largely to allow continuous cwnd growth during slow start, | ||
| 2125 | * we use the rate at which ACKs come back as the "actual" | ||
| 2126 | * rate, rather than the rate at which data is sent. | ||
| 2127 | * o To speed convergence to the right rate, we set the cwnd | ||
| 2128 | * to achieve the right ("actual") rate when we exit slow start. | ||
| 2129 | * o To filter out the noise caused by delayed ACKs, we use the | ||
| 2130 | * minimum RTT sample observed during the last RTT to calculate | ||
| 2131 | * the actual rate. | ||
| 2132 | * o When the sender re-starts from idle, it waits until it has | ||
| 2133 | * received ACKs for an entire flight of new data before making | ||
| 2134 | * a cwnd adjustment decision. The original Vegas implementation | ||
| 2135 | * assumed senders never went idle. | ||
| 2136 | */ | ||
| 2137 | static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
| 2138 | { | ||
| 2139 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
| 2140 | * | ||
| 2141 | * These are so named because they represent the approximate values | ||
| 2142 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
| 2143 | * precisely, they represent the amount of data sent during the RTT. | ||
| 2144 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
| 2145 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
| 2146 | * bytes of data have been ACKed during the course of the RTT, giving | ||
| 2147 | * an "actual" rate of: | ||
| 2148 | * | ||
| 2149 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
| 2150 | * | ||
| 2151 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
| 2152 | * because delayed ACKs can cover more than one segment, so they | ||
| 2153 | * don't line up nicely with the boundaries of RTTs. | ||
| 2154 | * | ||
| 2155 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
| 2156 | * advance of the left edge of our send window, so that the number | ||
| 2157 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
| 2158 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
| 2159 | */ | ||
| 2160 | |||
| 2161 | if (after(ack, tp->vegas.beg_snd_nxt)) { | ||
| 2162 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
| 2163 | u32 old_wnd, old_snd_cwnd; | ||
| 2164 | |||
| 2165 | |||
| 2166 | /* Here old_wnd is essentially the window of data that was | ||
| 2167 | * sent during the previous RTT, and has all | ||
| 2168 | * been acknowledged in the course of the RTT that ended | ||
| 2169 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
| 2170 | * is the cwnd during the previous RTT. | ||
| 2171 | */ | ||
| 2172 | old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / | ||
| 2173 | tp->mss_cache_std; | ||
| 2174 | old_snd_cwnd = tp->vegas.beg_snd_cwnd; | ||
| 2175 | |||
| 2176 | /* Save the extent of the current window so we can use this | ||
| 2177 | * at the end of the next RTT. | ||
| 2178 | */ | ||
| 2179 | tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; | ||
| 2180 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
| 2181 | tp->vegas.beg_snd_cwnd = tp->snd_cwnd; | ||
| 2182 | |||
| 2183 | /* Take into account the current RTT sample too, to | ||
| 2184 | * decrease the impact of delayed acks. This double counts | ||
| 2185 | * this sample since we count it for the next window as well, | ||
| 2186 | * but that's not too awful, since we're taking the min, | ||
| 2187 | * rather than averaging. | ||
| 2188 | */ | ||
| 2189 | vegas_rtt_calc(tp, seq_rtt); | ||
| 2190 | |||
| 2191 | /* We do the Vegas calculations only if we got enough RTT | ||
| 2192 | * samples that we can be reasonably sure that we got | ||
| 2193 | * at least one RTT sample that wasn't from a delayed ACK. | ||
| 2194 | * If we only had 2 samples total, | ||
| 2195 | * then that means we're getting only 1 ACK per RTT, which | ||
| 2196 | * means they're almost certainly delayed ACKs. | ||
| 2197 | * If we have 3 samples, we should be OK. | ||
| 2198 | */ | ||
| 2199 | |||
| 2200 | if (tp->vegas.cntRTT <= 2) { | ||
| 2201 | /* We don't have enough RTT samples to do the Vegas | ||
| 2202 | * calculation, so we'll behave like Reno. | ||
| 2203 | */ | ||
| 2204 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
| 2205 | tp->snd_cwnd++; | ||
| 2206 | } else { | ||
| 2207 | u32 rtt, target_cwnd, diff; | ||
| 2208 | |||
| 2209 | /* We have enough RTT samples, so, using the Vegas | ||
| 2210 | * algorithm, we determine if we should increase or | ||
| 2211 | * decrease cwnd, and by how much. | ||
| 2212 | */ | ||
| 2213 | |||
| 2214 | /* Pluck out the RTT we are using for the Vegas | ||
| 2215 | * calculations. This is the min RTT seen during the | ||
| 2216 | * last RTT. Taking the min filters out the effects | ||
| 2217 | * of delayed ACKs, at the cost of noticing congestion | ||
| 2218 | * a bit later. | ||
| 2219 | */ | ||
| 2220 | rtt = tp->vegas.minRTT; | ||
| 2221 | |||
| 2222 | /* Calculate the cwnd we should have, if we weren't | ||
| 2223 | * going too fast. | ||
| 2224 | * | ||
| 2225 | * This is: | ||
| 2226 | * (actual rate in segments) * baseRTT | ||
| 2227 | * We keep it as a fixed point number with | ||
| 2228 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
| 2229 | */ | ||
| 2230 | target_cwnd = ((old_wnd * tp->vegas.baseRTT) | ||
| 2231 | << V_PARAM_SHIFT) / rtt; | ||
| 2232 | |||
| 2233 | /* Calculate the difference between the window we had, | ||
| 2234 | * and the window we would like to have. This quantity | ||
| 2235 | * is the "Diff" from the Arizona Vegas papers. | ||
| 2236 | * | ||
| 2237 | * Again, this is a fixed point number with | ||
| 2238 | * V_PARAM_SHIFT bits to the right of the binary | ||
| 2239 | * point. | ||
| 2240 | */ | ||
| 2241 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
| 2242 | |||
| 2243 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
| 2244 | /* Slow start. */ | ||
| 2245 | if (diff > sysctl_tcp_vegas_gamma) { | ||
| 2246 | /* Going too fast. Time to slow down | ||
| 2247 | * and switch to congestion avoidance. | ||
| 2248 | */ | ||
| 2249 | tp->snd_ssthresh = 2; | ||
| 2250 | |||
| 2251 | /* Set cwnd to match the actual rate | ||
| 2252 | * exactly: | ||
| 2253 | * cwnd = (actual rate) * baseRTT | ||
| 2254 | * Then we add 1 because the integer | ||
| 2255 | * truncation robs us of full link | ||
| 2256 | * utilization. | ||
| 2257 | */ | ||
| 2258 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
| 2259 | (target_cwnd >> | ||
| 2260 | V_PARAM_SHIFT)+1); | ||
| 2261 | |||
| 2262 | } | ||
| 2263 | } else { | ||
| 2264 | /* Congestion avoidance. */ | ||
| 2265 | u32 next_snd_cwnd; | ||
| 2266 | |||
| 2267 | /* Figure out where we would like cwnd | ||
| 2268 | * to be. | ||
| 2269 | */ | ||
| 2270 | if (diff > sysctl_tcp_vegas_beta) { | ||
| 2271 | /* The old window was too fast, so | ||
| 2272 | * we slow down. | ||
| 2273 | */ | ||
| 2274 | next_snd_cwnd = old_snd_cwnd - 1; | ||
| 2275 | } else if (diff < sysctl_tcp_vegas_alpha) { | ||
| 2276 | /* We don't have enough extra packets | ||
| 2277 | * in the network, so speed up. | ||
| 2278 | */ | ||
| 2279 | next_snd_cwnd = old_snd_cwnd + 1; | ||
| 2280 | } else { | ||
| 2281 | /* Sending just as fast as we | ||
| 2282 | * should be. | ||
| 2283 | */ | ||
| 2284 | next_snd_cwnd = old_snd_cwnd; | ||
| 2285 | } | ||
| 2286 | |||
| 2287 | /* Adjust cwnd upward or downward, toward the | ||
| 2288 | * desired value. | ||
| 2289 | */ | ||
| 2290 | if (next_snd_cwnd > tp->snd_cwnd) | ||
| 2291 | tp->snd_cwnd++; | ||
| 2292 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
| 2293 | tp->snd_cwnd--; | ||
| 2294 | } | ||
| 2295 | } | ||
| 2296 | |||
| 2297 | /* Wipe the slate clean for the next RTT. */ | ||
| 2298 | tp->vegas.cntRTT = 0; | ||
| 2299 | tp->vegas.minRTT = 0x7fffffff; | ||
| 2300 | } | ||
| 2301 | |||
| 2302 | /* The following code is executed for every ack we receive, | ||
| 2303 | * except for conditions checked in should_advance_cwnd() | ||
| 2304 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
| 2305 | * we only execute this code if the ack actually acked some | ||
| 2306 | * data. | ||
| 2307 | */ | ||
| 2308 | |||
| 2309 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
| 2310 | * (If we are not in slow start then we are in congestion avoidance, | ||
| 2311 | * and adjust our congestion window only once per RTT. See the code | ||
| 2312 | * above.) | ||
| 2313 | */ | ||
| 2314 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
| 2315 | tp->snd_cwnd++; | ||
| 2316 | |||
| 2317 | /* to keep cwnd from growing without bound */ | ||
| 2318 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 2319 | |||
| 2320 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
| 2321 | * 2 MSS. | ||
| 2322 | * | ||
| 2323 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
| 2324 | */ | ||
| 2325 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
| 2326 | |||
| 2327 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 2328 | } | ||
| 2329 | |||
| 2330 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
| 2331 | { | ||
| 2332 | if (tcp_vegas_enabled(tp)) | ||
| 2333 | vegas_cong_avoid(tp, ack, seq_rtt); | ||
| 2334 | else | ||
| 2335 | reno_cong_avoid(tp); | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | /* Restart timer after forward progress on connection. | 1947 | /* Restart timer after forward progress on connection. |
| 2339 | * RFC2988 recommends to restart timer to now+rto. | 1948 | * RFC2988 recommends to restart timer to now+rto. |
| 2340 | */ | 1949 | */ |
| @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
| 2415 | 2024 | ||
| 2416 | 2025 | ||
| 2417 | /* Remove acknowledged frames from the retransmission queue. */ | 2026 | /* Remove acknowledged frames from the retransmission queue. */ |
| 2418 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | 2027 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) |
| 2419 | { | 2028 | { |
| 2420 | struct tcp_sock *tp = tcp_sk(sk); | 2029 | struct tcp_sock *tp = tcp_sk(sk); |
| 2421 | struct sk_buff *skb; | 2030 | struct sk_buff *skb; |
| 2422 | __u32 now = tcp_time_stamp; | 2031 | __u32 now = tcp_time_stamp; |
| 2423 | int acked = 0; | 2032 | int acked = 0; |
| 2424 | __s32 seq_rtt = -1; | 2033 | __s32 seq_rtt = -1; |
| 2034 | struct timeval usnow; | ||
| 2035 | u32 pkts_acked = 0; | ||
| 2036 | |||
| 2037 | if (seq_usrtt) | ||
| 2038 | do_gettimeofday(&usnow); | ||
| 2425 | 2039 | ||
| 2426 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2040 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
| 2427 | skb != sk->sk_send_head) { | 2041 | skb != sk->sk_send_head) { |
| @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2448 | */ | 2062 | */ |
| 2449 | if (!(scb->flags & TCPCB_FLAG_SYN)) { | 2063 | if (!(scb->flags & TCPCB_FLAG_SYN)) { |
| 2450 | acked |= FLAG_DATA_ACKED; | 2064 | acked |= FLAG_DATA_ACKED; |
| 2065 | ++pkts_acked; | ||
| 2451 | } else { | 2066 | } else { |
| 2452 | acked |= FLAG_SYN_ACKED; | 2067 | acked |= FLAG_SYN_ACKED; |
| 2453 | tp->retrans_stamp = 0; | 2068 | tp->retrans_stamp = 0; |
| @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2461 | seq_rtt = -1; | 2076 | seq_rtt = -1; |
| 2462 | } else if (seq_rtt < 0) | 2077 | } else if (seq_rtt < 0) |
| 2463 | seq_rtt = now - scb->when; | 2078 | seq_rtt = now - scb->when; |
| 2079 | if (seq_usrtt) | ||
| 2080 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | ||
| 2081 | + (usnow.tv_usec - skb->stamp.tv_usec); | ||
| 2082 | |||
| 2464 | if (sacked & TCPCB_SACKED_ACKED) | 2083 | if (sacked & TCPCB_SACKED_ACKED) |
| 2465 | tp->sacked_out -= tcp_skb_pcount(skb); | 2084 | tp->sacked_out -= tcp_skb_pcount(skb); |
| 2466 | if (sacked & TCPCB_LOST) | 2085 | if (sacked & TCPCB_LOST) |
| @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2479 | } | 2098 | } |
| 2480 | 2099 | ||
| 2481 | if (acked&FLAG_ACKED) { | 2100 | if (acked&FLAG_ACKED) { |
| 2482 | tcp_ack_update_rtt(tp, acked, seq_rtt); | 2101 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); |
| 2483 | tcp_ack_packets_out(sk, tp); | 2102 | tcp_ack_packets_out(sk, tp); |
| 2103 | |||
| 2104 | if (tp->ca_ops->pkts_acked) | ||
| 2105 | tp->ca_ops->pkts_acked(tp, pkts_acked); | ||
| 2484 | } | 2106 | } |
| 2485 | 2107 | ||
| 2486 | #if FASTRETRANS_DEBUG > 0 | 2108 | #if FASTRETRANS_DEBUG > 0 |
| @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
| 2624 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2246 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
| 2625 | } | 2247 | } |
| 2626 | 2248 | ||
| 2627 | /* | ||
| 2628 | * TCP Westwood+ | ||
| 2629 | */ | ||
| 2630 | |||
| 2631 | /* | ||
| 2632 | * @init_westwood | ||
| 2633 | * This function initializes fields used in TCP Westwood+. We can't | ||
| 2634 | * get no information about RTTmin at this time so we simply set it to | ||
| 2635 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
| 2636 | * since in this way we're sure it will be updated in a consistent | ||
| 2637 | * way as soon as possible. It will reasonably happen within the first | ||
| 2638 | * RTT period of the connection lifetime. | ||
| 2639 | */ | ||
| 2640 | |||
| 2641 | static void init_westwood(struct sock *sk) | ||
| 2642 | { | ||
| 2643 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2644 | |||
| 2645 | tp->westwood.bw_ns_est = 0; | ||
| 2646 | tp->westwood.bw_est = 0; | ||
| 2647 | tp->westwood.accounted = 0; | ||
| 2648 | tp->westwood.cumul_ack = 0; | ||
| 2649 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
| 2650 | tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; | ||
| 2651 | tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; | ||
| 2652 | tp->westwood.snd_una = tp->snd_una; | ||
| 2653 | } | ||
| 2654 | |||
| 2655 | /* | ||
| 2656 | * @westwood_do_filter | ||
| 2657 | * Low-pass filter. Implemented using constant coefficients. | ||
| 2658 | */ | ||
| 2659 | |||
| 2660 | static inline __u32 westwood_do_filter(__u32 a, __u32 b) | ||
| 2661 | { | ||
| 2662 | return (((7 * a) + b) >> 3); | ||
| 2663 | } | ||
| 2664 | |||
| 2665 | static void westwood_filter(struct sock *sk, __u32 delta) | ||
| 2666 | { | ||
| 2667 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2668 | |||
| 2669 | tp->westwood.bw_ns_est = | ||
| 2670 | westwood_do_filter(tp->westwood.bw_ns_est, | ||
| 2671 | tp->westwood.bk / delta); | ||
| 2672 | tp->westwood.bw_est = | ||
| 2673 | westwood_do_filter(tp->westwood.bw_est, | ||
| 2674 | tp->westwood.bw_ns_est); | ||
| 2675 | } | ||
| 2676 | |||
| 2677 | /* | ||
| 2678 | * @westwood_update_rttmin | ||
| 2679 | * It is used to update RTTmin. In this case we MUST NOT use | ||
| 2680 | * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! | ||
| 2681 | */ | ||
| 2682 | |||
| 2683 | static inline __u32 westwood_update_rttmin(const struct sock *sk) | ||
| 2684 | { | ||
| 2685 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2686 | __u32 rttmin = tp->westwood.rtt_min; | ||
| 2687 | |||
| 2688 | if (tp->westwood.rtt != 0 && | ||
| 2689 | (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) | ||
| 2690 | rttmin = tp->westwood.rtt; | ||
| 2691 | |||
| 2692 | return rttmin; | ||
| 2693 | } | ||
| 2694 | |||
| 2695 | /* | ||
| 2696 | * @westwood_acked | ||
| 2697 | * Evaluate increases for dk. | ||
| 2698 | */ | ||
| 2699 | |||
| 2700 | static inline __u32 westwood_acked(const struct sock *sk) | ||
| 2701 | { | ||
| 2702 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2703 | |||
| 2704 | return tp->snd_una - tp->westwood.snd_una; | ||
| 2705 | } | ||
| 2706 | |||
| 2707 | /* | ||
| 2708 | * @westwood_new_window | ||
| 2709 | * It evaluates if we are receiving data inside the same RTT window as | ||
| 2710 | * when we started. | ||
| 2711 | * Return value: | ||
| 2712 | * It returns 0 if we are still evaluating samples in the same RTT | ||
| 2713 | * window, 1 if the sample has to be considered in the next window. | ||
| 2714 | */ | ||
| 2715 | |||
| 2716 | static int westwood_new_window(const struct sock *sk) | ||
| 2717 | { | ||
| 2718 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2719 | __u32 left_bound; | ||
| 2720 | __u32 rtt; | ||
| 2721 | int ret = 0; | ||
| 2722 | |||
| 2723 | left_bound = tp->westwood.rtt_win_sx; | ||
| 2724 | rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); | ||
| 2725 | |||
| 2726 | /* | ||
| 2727 | * A RTT-window has passed. Be careful since if RTT is less than | ||
| 2728 | * 50ms we don't filter but we continue 'building the sample'. | ||
| 2729 | * This minimum limit was chosen since an estimation on small | ||
| 2730 | * time intervals is better to avoid... | ||
| 2731 | * Obviously on a LAN we reasonably will always have | ||
| 2732 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
| 2733 | */ | ||
| 2734 | |||
| 2735 | if ((left_bound + rtt) < tcp_time_stamp) | ||
| 2736 | ret = 1; | ||
| 2737 | |||
| 2738 | return ret; | ||
| 2739 | } | ||
| 2740 | |||
| 2741 | /* | ||
| 2742 | * @westwood_update_window | ||
| 2743 | * It updates RTT evaluation window if it is the right moment to do | ||
| 2744 | * it. If so it calls filter for evaluating bandwidth. | ||
| 2745 | */ | ||
| 2746 | |||
| 2747 | static void __westwood_update_window(struct sock *sk, __u32 now) | ||
| 2748 | { | ||
| 2749 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2750 | __u32 delta = now - tp->westwood.rtt_win_sx; | ||
| 2751 | |||
| 2752 | if (delta) { | ||
| 2753 | if (tp->westwood.rtt) | ||
| 2754 | westwood_filter(sk, delta); | ||
| 2755 | |||
| 2756 | tp->westwood.bk = 0; | ||
| 2757 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
| 2758 | } | ||
| 2759 | } | ||
| 2760 | |||
| 2761 | |||
| 2762 | static void westwood_update_window(struct sock *sk, __u32 now) | ||
| 2763 | { | ||
| 2764 | if (westwood_new_window(sk)) | ||
| 2765 | __westwood_update_window(sk, now); | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | /* | ||
| 2769 | * @__tcp_westwood_fast_bw | ||
| 2770 | * It is called when we are in fast path. In particular it is called when | ||
| 2771 | * header prediction is successful. In such case in fact update is | ||
| 2772 | * straightforward and doesn't need any particular care. | ||
| 2773 | */ | ||
| 2774 | |||
| 2775 | static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2776 | { | ||
| 2777 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2778 | |||
| 2779 | westwood_update_window(sk, tcp_time_stamp); | ||
| 2780 | |||
| 2781 | tp->westwood.bk += westwood_acked(sk); | ||
| 2782 | tp->westwood.snd_una = tp->snd_una; | ||
| 2783 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
| 2784 | } | ||
| 2785 | |||
| 2786 | static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2787 | { | ||
| 2788 | if (tcp_is_westwood(tcp_sk(sk))) | ||
| 2789 | __tcp_westwood_fast_bw(sk, skb); | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | |||
| 2793 | /* | ||
| 2794 | * @westwood_dupack_update | ||
| 2795 | * It updates accounted and cumul_ack when receiving a dupack. | ||
| 2796 | */ | ||
| 2797 | |||
| 2798 | static void westwood_dupack_update(struct sock *sk) | ||
| 2799 | { | ||
| 2800 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2801 | |||
| 2802 | tp->westwood.accounted += tp->mss_cache_std; | ||
| 2803 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
| 2804 | } | ||
| 2805 | |||
| 2806 | static inline int westwood_may_change_cumul(struct tcp_sock *tp) | ||
| 2807 | { | ||
| 2808 | return (tp->westwood.cumul_ack > tp->mss_cache_std); | ||
| 2809 | } | ||
| 2810 | |||
| 2811 | static inline void westwood_partial_update(struct tcp_sock *tp) | ||
| 2812 | { | ||
| 2813 | tp->westwood.accounted -= tp->westwood.cumul_ack; | ||
| 2814 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
| 2815 | } | ||
| 2816 | |||
| 2817 | static inline void westwood_complete_update(struct tcp_sock *tp) | ||
| 2818 | { | ||
| 2819 | tp->westwood.cumul_ack -= tp->westwood.accounted; | ||
| 2820 | tp->westwood.accounted = 0; | ||
| 2821 | } | ||
| 2822 | |||
| 2823 | /* | ||
| 2824 | * @westwood_acked_count | ||
| 2825 | * This function evaluates cumul_ack for evaluating dk in case of | ||
| 2826 | * delayed or partial acks. | ||
| 2827 | */ | ||
| 2828 | |||
| 2829 | static inline __u32 westwood_acked_count(struct sock *sk) | ||
| 2830 | { | ||
| 2831 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2832 | |||
| 2833 | tp->westwood.cumul_ack = westwood_acked(sk); | ||
| 2834 | |||
| 2835 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
| 2836 | * tp->snd_una. | ||
| 2837 | */ | ||
| 2838 | if (!(tp->westwood.cumul_ack)) | ||
| 2839 | westwood_dupack_update(sk); | ||
| 2840 | |||
| 2841 | if (westwood_may_change_cumul(tp)) { | ||
| 2842 | /* Partial or delayed ack */ | ||
| 2843 | if (tp->westwood.accounted >= tp->westwood.cumul_ack) | ||
| 2844 | westwood_partial_update(tp); | ||
| 2845 | else | ||
| 2846 | westwood_complete_update(tp); | ||
| 2847 | } | ||
| 2848 | |||
| 2849 | tp->westwood.snd_una = tp->snd_una; | ||
| 2850 | |||
| 2851 | return tp->westwood.cumul_ack; | ||
| 2852 | } | ||
| 2853 | |||
| 2854 | |||
| 2855 | /* | ||
| 2856 | * @__tcp_westwood_slow_bw | ||
| 2857 | * It is called when something is going wrong..even if there could | ||
| 2858 | * be no problems! In fact a simple delayed packet may trigger a | ||
| 2859 | * dupack. But we need to be careful in such case. | ||
| 2860 | */ | ||
| 2861 | |||
| 2862 | static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2863 | { | ||
| 2864 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2865 | |||
| 2866 | westwood_update_window(sk, tcp_time_stamp); | ||
| 2867 | |||
| 2868 | tp->westwood.bk += westwood_acked_count(sk); | ||
| 2869 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
| 2870 | } | ||
| 2871 | |||
| 2872 | static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2873 | { | ||
| 2874 | if (tcp_is_westwood(tcp_sk(sk))) | ||
| 2875 | __tcp_westwood_slow_bw(sk, skb); | ||
| 2876 | } | ||
| 2877 | |||
| 2878 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2249 | /* This routine deals with incoming acks, but not outgoing ones. */ |
| 2879 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2250 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
| 2880 | { | 2251 | { |
| @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2884 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2255 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 2885 | u32 prior_in_flight; | 2256 | u32 prior_in_flight; |
| 2886 | s32 seq_rtt; | 2257 | s32 seq_rtt; |
| 2258 | s32 seq_usrtt = 0; | ||
| 2887 | int prior_packets; | 2259 | int prior_packets; |
| 2888 | 2260 | ||
| 2889 | /* If the ack is newer than sent or older than previous acks | 2261 | /* If the ack is newer than sent or older than previous acks |
| @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2902 | */ | 2274 | */ |
| 2903 | tcp_update_wl(tp, ack, ack_seq); | 2275 | tcp_update_wl(tp, ack, ack_seq); |
| 2904 | tp->snd_una = ack; | 2276 | tp->snd_una = ack; |
| 2905 | tcp_westwood_fast_bw(sk, skb); | ||
| 2906 | flag |= FLAG_WIN_UPDATE; | 2277 | flag |= FLAG_WIN_UPDATE; |
| 2907 | 2278 | ||
| 2279 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | ||
| 2280 | |||
| 2908 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2281 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
| 2909 | } else { | 2282 | } else { |
| 2910 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 2283 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
| @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2920 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2293 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
| 2921 | flag |= FLAG_ECE; | 2294 | flag |= FLAG_ECE; |
| 2922 | 2295 | ||
| 2923 | tcp_westwood_slow_bw(sk,skb); | 2296 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); |
| 2924 | } | 2297 | } |
| 2925 | 2298 | ||
| 2926 | /* We passed data and got it acked, remove any soft error | 2299 | /* We passed data and got it acked, remove any soft error |
| @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2935 | prior_in_flight = tcp_packets_in_flight(tp); | 2308 | prior_in_flight = tcp_packets_in_flight(tp); |
| 2936 | 2309 | ||
| 2937 | /* See if we can take anything off of the retransmit queue. */ | 2310 | /* See if we can take anything off of the retransmit queue. */ |
| 2938 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); | 2311 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
| 2312 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
| 2939 | 2313 | ||
| 2940 | if (tp->frto_counter) | 2314 | if (tp->frto_counter) |
| 2941 | tcp_process_frto(sk, prior_snd_una); | 2315 | tcp_process_frto(sk, prior_snd_una); |
| 2942 | 2316 | ||
| 2943 | if (tcp_ack_is_dubious(tp, flag)) { | 2317 | if (tcp_ack_is_dubious(tp, flag)) { |
| 2944 | /* Advance CWND, if state allows this. */ | 2318 | /* Advance CWND, if state allows this. */ |
| 2945 | if ((flag & FLAG_DATA_ACKED) && | 2319 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) |
| 2946 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && | 2320 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); |
| 2947 | tcp_may_raise_cwnd(tp, flag)) | ||
| 2948 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
| 2949 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2321 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
| 2950 | } else { | 2322 | } else { |
| 2951 | if ((flag & FLAG_DATA_ACKED) && | 2323 | if ((flag & FLAG_DATA_ACKED)) |
| 2952 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) | 2324 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); |
| 2953 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
| 2954 | } | 2325 | } |
| 2955 | 2326 | ||
| 2956 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2327 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
| @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4552 | 3923 | ||
| 4553 | tcp_init_metrics(sk); | 3924 | tcp_init_metrics(sk); |
| 4554 | 3925 | ||
| 3926 | tcp_init_congestion_control(tp); | ||
| 3927 | |||
| 4555 | /* Prevent spurious tcp_cwnd_restart() on first data | 3928 | /* Prevent spurious tcp_cwnd_restart() on first data |
| 4556 | * packet. | 3929 | * packet. |
| 4557 | */ | 3930 | */ |
| @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4708 | if(tp->af_specific->conn_request(sk, skb) < 0) | 4081 | if(tp->af_specific->conn_request(sk, skb) < 0) |
| 4709 | return 1; | 4082 | return 1; |
| 4710 | 4083 | ||
| 4711 | init_westwood(sk); | ||
| 4712 | init_bictcp(tp); | ||
| 4713 | |||
| 4714 | /* Now we have several options: In theory there is | 4084 | /* Now we have several options: In theory there is |
| 4715 | * nothing else in the frame. KA9Q has an option to | 4085 | * nothing else in the frame. KA9Q has an option to |
| 4716 | * send data with the syn, BSD accepts data with the | 4086 | * send data with the syn, BSD accepts data with the |
| @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4732 | goto discard; | 4102 | goto discard; |
| 4733 | 4103 | ||
| 4734 | case TCP_SYN_SENT: | 4104 | case TCP_SYN_SENT: |
| 4735 | init_westwood(sk); | ||
| 4736 | init_bictcp(tp); | ||
| 4737 | |||
| 4738 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); | 4105 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); |
| 4739 | if (queued >= 0) | 4106 | if (queued >= 0) |
| 4740 | return queued; | 4107 | return queued; |
| @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4816 | */ | 4183 | */ |
| 4817 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4184 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 4818 | !tp->srtt) | 4185 | !tp->srtt) |
| 4819 | tcp_ack_saw_tstamp(tp, 0); | 4186 | tcp_ack_saw_tstamp(tp, 0, 0); |
| 4820 | 4187 | ||
| 4821 | if (tp->rx_opt.tstamp_ok) | 4188 | if (tp->rx_opt.tstamp_ok) |
| 4822 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4189 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4828 | 4195 | ||
| 4829 | tcp_init_metrics(sk); | 4196 | tcp_init_metrics(sk); |
| 4830 | 4197 | ||
| 4198 | tcp_init_congestion_control(tp); | ||
| 4199 | |||
| 4831 | /* Prevent spurious tcp_cwnd_restart() on | 4200 | /* Prevent spurious tcp_cwnd_restart() on |
| 4832 | * first data packet. | 4201 | * first data packet. |
| 4833 | */ | 4202 | */ |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad19..9122814c13ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
| @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
| 2048 | tp->mss_cache_std = tp->mss_cache = 536; | 2048 | tp->mss_cache_std = tp->mss_cache = 536; |
| 2049 | 2049 | ||
| 2050 | tp->reordering = sysctl_tcp_reordering; | 2050 | tp->reordering = sysctl_tcp_reordering; |
| 2051 | tp->ca_ops = &tcp_reno; | ||
| 2051 | 2052 | ||
| 2052 | sk->sk_state = TCP_CLOSE; | 2053 | sk->sk_state = TCP_CLOSE; |
| 2053 | 2054 | ||
| @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
| 2070 | 2071 | ||
| 2071 | tcp_clear_xmit_timers(sk); | 2072 | tcp_clear_xmit_timers(sk); |
| 2072 | 2073 | ||
| 2074 | tcp_cleanup_congestion_control(tp); | ||
| 2075 | |||
| 2073 | /* Clean up the write buffer. */ | 2076 | /* Clean up the write buffer. */ |
| 2074 | sk_stream_writequeue_purge(sk); | 2077 | sk_stream_writequeue_purge(sk); |
| 2075 | 2078 | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f3..f42a284164b7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
| @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 774 | newtp->frto_counter = 0; | 774 | newtp->frto_counter = 0; |
| 775 | newtp->frto_highmark = 0; | 775 | newtp->frto_highmark = 0; |
| 776 | 776 | ||
| 777 | newtp->ca_ops = &tcp_reno; | ||
| 778 | |||
| 777 | tcp_set_ca_state(newtp, TCP_CA_Open); | 779 | tcp_set_ca_state(newtp, TCP_CA_Open); |
| 778 | tcp_init_xmit_timers(newsk); | 780 | tcp_init_xmit_timers(newsk); |
| 779 | skb_queue_head_init(&newtp->out_of_order_queue); | 781 | skb_queue_head_init(&newtp->out_of_order_queue); |
| @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 842 | if (newtp->ecn_flags&TCP_ECN_OK) | 844 | if (newtp->ecn_flags&TCP_ECN_OK) |
| 843 | sock_set_flag(newsk, SOCK_NO_LARGESEND); | 845 | sock_set_flag(newsk, SOCK_NO_LARGESEND); |
| 844 | 846 | ||
| 845 | tcp_ca_init(newtp); | ||
| 846 | |||
| 847 | TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); | 847 | TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); |
| 848 | } | 848 | } |
| 849 | return newsk; | 849 | return newsk; |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e337..0e17c244875c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | |||
| 111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); | 111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); |
| 112 | u32 cwnd = tp->snd_cwnd; | 112 | u32 cwnd = tp->snd_cwnd; |
| 113 | 113 | ||
| 114 | if (tcp_is_vegas(tp)) | 114 | tcp_ca_event(tp, CA_EVENT_CWND_RESTART); |
| 115 | tcp_vegas_enable(tp); | ||
| 116 | 115 | ||
| 117 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 116 | tp->snd_ssthresh = tcp_current_ssthresh(tp); |
| 118 | restart_cwnd = min(restart_cwnd, cwnd); | 117 | restart_cwnd = min(restart_cwnd, cwnd); |
| @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 280 | #define SYSCTL_FLAG_WSCALE 0x2 | 279 | #define SYSCTL_FLAG_WSCALE 0x2 |
| 281 | #define SYSCTL_FLAG_SACK 0x4 | 280 | #define SYSCTL_FLAG_SACK 0x4 |
| 282 | 281 | ||
| 282 | /* If congestion control is doing timestamping */ | ||
| 283 | if (tp->ca_ops->rtt_sample) | ||
| 284 | do_gettimeofday(&skb->stamp); | ||
| 285 | |||
| 283 | sysctl_flags = 0; | 286 | sysctl_flags = 0; |
| 284 | if (tcb->flags & TCPCB_FLAG_SYN) { | 287 | if (tcb->flags & TCPCB_FLAG_SYN) { |
| 285 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; | 288 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; |
| @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 304 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); | 307 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); |
| 305 | } | 308 | } |
| 306 | 309 | ||
| 307 | /* | 310 | if (tcp_packets_in_flight(tp) == 0) |
| 308 | * If the connection is idle and we are restarting, | 311 | tcp_ca_event(tp, CA_EVENT_TX_START); |
| 309 | * then we don't want to do any Vegas calculations | ||
| 310 | * until we get fresh RTT samples. So when we | ||
| 311 | * restart, we reset our Vegas state to a clean | ||
| 312 | * slate. After we get acks for this flight of | ||
| 313 | * packets, _then_ we can make Vegas calculations | ||
| 314 | * again. | ||
| 315 | */ | ||
| 316 | if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) | ||
| 317 | tcp_vegas_enable(tp); | ||
| 318 | 312 | ||
| 319 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | 313 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); |
| 320 | skb->h.th = th; | 314 | skb->h.th = th; |
| @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) | |||
| 521 | * skbs, which it never sent before. --ANK | 515 | * skbs, which it never sent before. --ANK |
| 522 | */ | 516 | */ |
| 523 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; | 517 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; |
| 518 | buff->stamp = skb->stamp; | ||
| 524 | 519 | ||
| 525 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { | 520 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { |
| 526 | tp->lost_out -= tcp_skb_pcount(skb); | 521 | tp->lost_out -= tcp_skb_pcount(skb); |
| @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) | |||
| 1449 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 1444 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
| 1450 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1445 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
| 1451 | tcp_initialize_rcv_mss(sk); | 1446 | tcp_initialize_rcv_mss(sk); |
| 1452 | tcp_ca_init(tp); | ||
| 1453 | 1447 | ||
| 1454 | tcp_select_initial_window(tcp_full_space(sk), | 1448 | tcp_select_initial_window(tcp_full_space(sk), |
| 1455 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | 1449 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
| @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) | |||
| 1503 | TCP_SKB_CB(buff)->end_seq = tp->write_seq; | 1497 | TCP_SKB_CB(buff)->end_seq = tp->write_seq; |
| 1504 | tp->snd_nxt = tp->write_seq; | 1498 | tp->snd_nxt = tp->write_seq; |
| 1505 | tp->pushed_seq = tp->write_seq; | 1499 | tp->pushed_seq = tp->write_seq; |
| 1506 | tcp_ca_init(tp); | ||
| 1507 | 1500 | ||
| 1508 | /* Send it off. */ | 1501 | /* Send it off. */ |
| 1509 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 1502 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a83..fce56039b0e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
| @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) | |||
| 2025 | sk->sk_state = TCP_CLOSE; | 2025 | sk->sk_state = TCP_CLOSE; |
| 2026 | 2026 | ||
| 2027 | tp->af_specific = &ipv6_specific; | 2027 | tp->af_specific = &ipv6_specific; |
| 2028 | 2028 | tp->ca_ops = &tcp_reno; | |
| 2029 | sk->sk_write_space = sk_stream_write_space; | 2029 | sk->sk_write_space = sk_stream_write_space; |
| 2030 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | 2030 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |
| 2031 | 2031 | ||
