-rw-r--r--  Documentation/networking/ip-sysctl.txt |  56
-rw-r--r--  Documentation/networking/tcp.txt       |  69
-rw-r--r--  include/linux/sysctl.h                  |   9
-rw-r--r--  include/linux/tcp.h                     |  49
-rw-r--r--  include/linux/tcp_diag.h                |   4
-rw-r--r--  include/net/tcp.h                       | 237
-rw-r--r--  net/ipv4/Kconfig                        |  90
-rw-r--r--  net/ipv4/Makefile                       |  10
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c              | 114
-rw-r--r--  net/ipv4/tcp.c                          |   2
-rw-r--r--  net/ipv4/tcp_bic.c                      | 331
-rw-r--r--  net/ipv4/tcp_cong.c                     | 195
-rw-r--r--  net/ipv4/tcp_diag.c                     |  34
-rw-r--r--  net/ipv4/tcp_highspeed.c                | 181
-rw-r--r--  net/ipv4/tcp_htcp.c                     | 289
-rw-r--r--  net/ipv4/tcp_hybla.c                    | 187
-rw-r--r--  net/ipv4/tcp_input.c                    | 737
-rw-r--r--  net/ipv4/tcp_ipv4.c                     |   3
-rw-r--r--  net/ipv4/tcp_minisocks.c                |   4
-rw-r--r--  net/ipv4/tcp_output.c                   |  23
-rw-r--r--  net/ipv4/tcp_scalable.c                 |  68
-rw-r--r--  net/ipv4/tcp_vegas.c                    | 411
-rw-r--r--  net/ipv4/tcp_westwood.c                 | 259
-rw-r--r--  net/ipv6/tcp_ipv6.c                     |   2
24 files changed, 2304 insertions, 1060 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a2c893a7475d..ab65714d95fc 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
| @@ -304,57 +304,6 @@ tcp_low_latency - BOOLEAN | |||
| 304 | changed would be a Beowulf compute cluster. | 304 | changed would be a Beowulf compute cluster. |
| 305 | Default: 0 | 305 | Default: 0 |
| 306 | 306 | ||
| 307 | tcp_westwood - BOOLEAN | ||
| 308 | Enable TCP Westwood+ congestion control algorithm. | ||
| 309 | TCP Westwood+ is a sender-side only modification of the TCP Reno | ||
| 310 | protocol stack that optimizes the performance of TCP congestion | ||
| 311 | control. It is based on end-to-end bandwidth estimation to set | ||
| 312 | congestion window and slow start threshold after a congestion | ||
| 313 | episode. Using this estimation, TCP Westwood+ adaptively sets a | ||
| 314 | slow start threshold and a congestion window which takes into | ||
| 315 | account the bandwidth used at the time congestion is experienced. | ||
| 316 | TCP Westwood+ significantly increases fairness wrt TCP Reno in | ||
| 317 | wired networks and throughput over wireless links. | ||
| 318 | Default: 0 | ||
| 319 | |||
| 320 | tcp_vegas_cong_avoid - BOOLEAN | ||
| 321 | Enable TCP Vegas congestion avoidance algorithm. | ||
| 322 | TCP Vegas is a sender-side only change to TCP that anticipates | ||
| 323 | the onset of congestion by estimating the bandwidth. TCP Vegas | ||
| 324 | adjusts the sending rate by modifying the congestion | ||
| 325 | window. TCP Vegas should provide less packet loss, but it is | ||
| 326 | not as aggressive as TCP Reno. | ||
| 327 | Default:0 | ||
| 328 | |||
| 329 | tcp_bic - BOOLEAN | ||
| 330 | Enable BIC TCP congestion control algorithm. | ||
| 331 | BIC-TCP is a sender-side only change that ensures a linear RTT | ||
| 332 | fairness under large windows while offering both scalability and | ||
| 333 | bounded TCP-friendliness. The protocol combines two schemes | ||
| 334 | called additive increase and binary search increase. When the | ||
| 335 | congestion window is large, additive increase with a large | ||
| 336 | increment ensures linear RTT fairness as well as good | ||
| 337 | scalability. Under small congestion windows, binary search | ||
| 338 | increase provides TCP friendliness. | ||
| 339 | Default: 0 | ||
| 340 | |||
| 341 | tcp_bic_low_window - INTEGER | ||
| 342 | Sets the threshold window (in packets) where BIC TCP starts to | ||
| 343 | adjust the congestion window. Below this threshold BIC TCP behaves | ||
| 344 | the same as the default TCP Reno. | ||
| 345 | Default: 14 | ||
| 346 | |||
| 347 | tcp_bic_fast_convergence - BOOLEAN | ||
| 348 | Forces BIC TCP to more quickly respond to changes in congestion | ||
| 349 | window. Allows two flows sharing the same connection to converge | ||
| 350 | more rapidly. | ||
| 351 | Default: 1 | ||
| 352 | |||
| 353 | tcp_default_win_scale - INTEGER | ||
| 354 | Sets the minimum window scale TCP will negotiate for on all | ||
| 355 | conections. | ||
| 356 | Default: 7 | ||
| 357 | |||
| 358 | tcp_tso_win_divisor - INTEGER | 307 | tcp_tso_win_divisor - INTEGER |
| 359 | This allows control over what percentage of the congestion window | 308 | This allows control over what percentage of the congestion window |
| 360 | can be consumed by a single TSO frame. | 309 | can be consumed by a single TSO frame. |
| @@ -368,6 +317,11 @@ tcp_frto - BOOLEAN | |||
| 368 | where packet loss is typically due to random radio interference | 317 | where packet loss is typically due to random radio interference |
| 369 | rather than intermediate router congestion. | 318 | rather than intermediate router congestion. |
| 370 | 319 | ||
| 320 | tcp_congestion_control - STRING | ||
| 321 | Set the congestion control algorithm to be used for new | ||
| 322 | connections. The algorithm "reno" is always available, but | ||
| 323 | additional choices may be available based on kernel configuration. | ||
| 324 | |||
| 371 | somaxconn - INTEGER | 325 | somaxconn - INTEGER |
| 372 | Limit of socket listen() backlog, known in userspace as SOMAXCONN. | 326 | Limit of socket listen() backlog, known in userspace as SOMAXCONN. |
| 373 | Defaults to 128. See also tcp_max_syn_backlog for additional tuning | 327 | Defaults to 128. See also tcp_max_syn_backlog for additional tuning |
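The new tcp_congestion_control knob follows the usual sysctl conventions: for example, "echo bic > /proc/sys/net/ipv4/tcp_congestion_control" (or "sysctl -w net.ipv4.tcp_congestion_control=bic") selects BIC for connections created afterwards. Per the handler added to net/ipv4/sysctl_net_ipv4.c further down, naming an algorithm that is not loaded triggers a request_module("tcp_<name>") attempt, and the write fails (ENOENT) if no matching algorithm can be found.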
diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt
index 71749007091e..0fa300425575 100644
--- a/Documentation/networking/tcp.txt
+++ b/Documentation/networking/tcp.txt
| @@ -1,5 +1,72 @@ | |||
| 1 | How the new TCP output machine [nyi] works. | 1 | TCP protocol |
| 2 | ============ | ||
| 3 | |||
| 4 | Last updated: 21 June 2005 | ||
| 5 | |||
| 6 | Contents | ||
| 7 | ======== | ||
| 8 | |||
| 9 | - Congestion control | ||
| 10 | - How the new TCP output machine [nyi] works | ||
| 11 | |||
| 12 | Congestion control | ||
| 13 | ================== | ||
| 14 | |||
| 15 | The following variables are used in the tcp_sock for congestion control: | ||
| 16 | snd_cwnd The size of the congestion window | ||
| 17 | snd_ssthresh Slow start threshold. We are in slow start if | ||
| 18 | snd_cwnd is less than this. | ||
| 19 | snd_cwnd_cnt A counter used to slow down the rate of increase | ||
| 20 | once we exceed slow start threshold. | ||
| 21 | snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to. | ||
| 22 | snd_cwnd_stamp Timestamp of when the congestion window was last validated. | ||
| 23 | snd_cwnd_used Used as a highwater mark for how much of the | ||
| 24 | congestion window is in use. It is used to adjust | ||
| 25 | snd_cwnd down when the link is limited by the | ||
| 26 | application rather than the network. | ||
| 27 | |||
| 28 | As of 2.6.13, Linux supports pluggable congestion control algorithms. | ||
| 29 | A congestion control mechanism can be registered through functions in | ||
| 30 | tcp_cong.c. The functions used by the congestion control mechanism are | ||
| 31 | registered by passing a tcp_congestion_ops struct to | ||
| 32 | tcp_register_congestion_control. At a minimum, name, ssthresh, | ||
| 33 | cong_avoid and min_cwnd must be valid. | ||
| 2 | 34 | ||
| 35 | Private data for a congestion control mechanism is stored in tp->ca_priv. | ||
| 36 | tcp_ca(tp) returns a pointer to this space. This is preallocated space - it | ||
| 37 | is important to check that your private data will fit in this space; | ||
| 38 | alternatively, space can be allocated elsewhere and a pointer to it can | ||
| 39 | be stored here. | ||
| 40 | |||
| 41 | There are three kinds of congestion control algorithms currently: The | ||
| 42 | simplest ones are derived from TCP reno (highspeed, scalable) and just | ||
| 43 | provide an alternative congestion window calculation. More complex | ||
| 44 | ones like BIC try to look at other events to provide better | ||
| 45 | heuristics. There are also round trip time based algorithms like | ||
| 46 | Vegas and Westwood+. | ||
| 47 | |||
| 48 | Good TCP congestion control is a complex problem because the algorithm | ||
| 49 | needs to maintain fairness and performance. Please review current | ||
| 50 | research and RFCs before developing new modules. | ||
| 51 | |||
| 52 | Which congestion control mechanism is used for a connection is | ||
| 53 | determined by the setting of the sysctl net.ipv4.tcp_congestion_control. | ||
| 54 | The default congestion control will be the last one registered (LIFO); | ||
| 55 | so if you build everything as modules, the default will be reno. If you | ||
| 56 | build with the defaults from Kconfig, then BIC will be builtin (not a module) | ||
| 57 | and it will end up being the default. | ||
| 58 | |||
| 59 | If you really want a particular default value then you will need | ||
| 60 | to set it with the sysctl. If you use a sysctl, the module will be autoloaded | ||
| 61 | if needed and you will get the expected protocol. If you ask for an | ||
| 62 | unknown congestion method, then the sysctl attempt will fail. | ||
| 63 | |||
| 64 | If you remove a TCP congestion control module, then you will get the next | ||
| 65 | available one. Since reno cannot be built as a module and cannot be | ||
| 66 | deleted, it will always be available. | ||
| 67 | |||
| 68 | How the new TCP output machine [nyi] works. | ||
| 69 | =========================================== | ||
| 3 | 70 | ||
| 4 | Data is kept on a single queue. The skb->users flag tells us if the frame is | 71 | Data is kept on a single queue. The skb->users flag tells us if the frame is |
| 5 | one that has been queued already. To add a frame we throw it on the end. Ack | 72 | one that has been queued already. To add a frame we throw it on the end. Ack |
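To make the registration interface described above concrete, here is a minimal sketch of a module that simply re-exports Reno behaviour under another name. The module and its "reno_copy" name are hypothetical (not part of this patch), and the sketch assumes the tcp_reno_* helpers declared in include/net/tcp.h are exported for module use, as tcp_cong.c suggests.

/* Hypothetical example: only the ops that tcp_register_congestion_control()
 * requires (ssthresh, cong_avoid, min_cwnd) are filled in, reusing the
 * stock Reno helpers.
 */
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops reno_copy = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
	.owner		= THIS_MODULE,
	.name		= "reno_copy",	/* must fit in TCP_CA_NAME_MAX */
};

static int __init reno_copy_register(void)
{
	return tcp_register_congestion_control(&reno_copy);
}

static void __exit reno_copy_unregister(void)
{
	tcp_unregister_congestion_control(&reno_copy);
}

module_init(reno_copy_register);
module_exit(reno_copy_unregister);
MODULE_LICENSE("GPL");

Because registration is LIFO, loading such a module makes it the default for new connections until the sysctl selects something else.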
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 614e939c78a4..72965bfe6cfb 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
| @@ -333,21 +333,14 @@ enum | |||
| 333 | NET_TCP_FRTO=92, | 333 | NET_TCP_FRTO=92, |
| 334 | NET_TCP_LOW_LATENCY=93, | 334 | NET_TCP_LOW_LATENCY=93, |
| 335 | NET_IPV4_IPFRAG_SECRET_INTERVAL=94, | 335 | NET_IPV4_IPFRAG_SECRET_INTERVAL=94, |
| 336 | NET_TCP_WESTWOOD=95, | ||
| 337 | NET_IPV4_IGMP_MAX_MSF=96, | 336 | NET_IPV4_IGMP_MAX_MSF=96, |
| 338 | NET_TCP_NO_METRICS_SAVE=97, | 337 | NET_TCP_NO_METRICS_SAVE=97, |
| 339 | NET_TCP_VEGAS=98, | ||
| 340 | NET_TCP_VEGAS_ALPHA=99, | ||
| 341 | NET_TCP_VEGAS_BETA=100, | ||
| 342 | NET_TCP_VEGAS_GAMMA=101, | ||
| 343 | NET_TCP_BIC=102, | ||
| 344 | NET_TCP_BIC_FAST_CONVERGENCE=103, | ||
| 345 | NET_TCP_BIC_LOW_WINDOW=104, | ||
| 346 | NET_TCP_DEFAULT_WIN_SCALE=105, | 338 | NET_TCP_DEFAULT_WIN_SCALE=105, |
| 347 | NET_TCP_MODERATE_RCVBUF=106, | 339 | NET_TCP_MODERATE_RCVBUF=106, |
| 348 | NET_TCP_TSO_WIN_DIVISOR=107, | 340 | NET_TCP_TSO_WIN_DIVISOR=107, |
| 349 | NET_TCP_BIC_BETA=108, | 341 | NET_TCP_BIC_BETA=108, |
| 350 | NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, | 342 | NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, |
| 343 | NET_TCP_CONG_CONTROL=110, | ||
| 351 | }; | 344 | }; |
| 352 | 345 | ||
| 353 | enum { | 346 | enum { |
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 97a7c9e03df5..3ea75dd6640a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
| @@ -203,13 +203,6 @@ struct tcp_sack_block { | |||
| 203 | __u32 end_seq; | 203 | __u32 end_seq; |
| 204 | }; | 204 | }; |
| 205 | 205 | ||
| 206 | enum tcp_congestion_algo { | ||
| 207 | TCP_RENO=0, | ||
| 208 | TCP_VEGAS, | ||
| 209 | TCP_WESTWOOD, | ||
| 210 | TCP_BIC, | ||
| 211 | }; | ||
| 212 | |||
| 213 | struct tcp_options_received { | 206 | struct tcp_options_received { |
| 214 | /* PAWS/RTTM data */ | 207 | /* PAWS/RTTM data */ |
| 215 | long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ | 208 | long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ |
| @@ -305,7 +298,7 @@ struct tcp_sock { | |||
| 305 | __u8 reordering; /* Packet reordering metric. */ | 298 | __u8 reordering; /* Packet reordering metric. */ |
| 306 | __u8 frto_counter; /* Number of new acks after RTO */ | 299 | __u8 frto_counter; /* Number of new acks after RTO */ |
| 307 | 300 | ||
| 308 | __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ | 301 | __u8 unused; |
| 309 | __u8 defer_accept; /* User waits for some data after accept() */ | 302 | __u8 defer_accept; /* User waits for some data after accept() */ |
| 310 | 303 | ||
| 311 | /* RTT measurement */ | 304 | /* RTT measurement */ |
| @@ -401,37 +394,10 @@ struct tcp_sock { | |||
| 401 | __u32 time; | 394 | __u32 time; |
| 402 | } rcvq_space; | 395 | } rcvq_space; |
| 403 | 396 | ||
| 404 | /* TCP Westwood structure */ | 397 | /* Pluggable TCP congestion control hook */ |
| 405 | struct { | 398 | struct tcp_congestion_ops *ca_ops; |
| 406 | __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ | 399 | u32 ca_priv[16]; |
| 407 | __u32 bw_est; /* bandwidth estimate */ | 400 | #define TCP_CA_PRIV_SIZE (16*sizeof(u32)) |
| 408 | __u32 rtt_win_sx; /* here starts a new evaluation... */ | ||
| 409 | __u32 bk; | ||
| 410 | __u32 snd_una; /* used for evaluating the number of acked bytes */ | ||
| 411 | __u32 cumul_ack; | ||
| 412 | __u32 accounted; | ||
| 413 | __u32 rtt; | ||
| 414 | __u32 rtt_min; /* minimum observed RTT */ | ||
| 415 | } westwood; | ||
| 416 | |||
| 417 | /* Vegas variables */ | ||
| 418 | struct { | ||
| 419 | __u32 beg_snd_nxt; /* right edge during last RTT */ | ||
| 420 | __u32 beg_snd_una; /* left edge during last RTT */ | ||
| 421 | __u32 beg_snd_cwnd; /* saves the size of the cwnd */ | ||
| 422 | __u8 doing_vegas_now;/* if true, do vegas for this RTT */ | ||
| 423 | __u16 cntRTT; /* # of RTTs measured within last RTT */ | ||
| 424 | __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ | ||
| 425 | __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ | ||
| 426 | } vegas; | ||
| 427 | |||
| 428 | /* BI TCP Parameters */ | ||
| 429 | struct { | ||
| 430 | __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ | ||
| 431 | __u32 last_max_cwnd; /* last maximium snd_cwnd */ | ||
| 432 | __u32 last_cwnd; /* the last snd_cwnd */ | ||
| 433 | __u32 last_stamp; /* time when updated last_cwnd */ | ||
| 434 | } bictcp; | ||
| 435 | }; | 401 | }; |
| 436 | 402 | ||
| 437 | static inline struct tcp_sock *tcp_sk(const struct sock *sk) | 403 | static inline struct tcp_sock *tcp_sk(const struct sock *sk) |
| @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) | |||
| 439 | return (struct tcp_sock *)sk; | 405 | return (struct tcp_sock *)sk; |
| 440 | } | 406 | } |
| 441 | 407 | ||
| 408 | static inline void *tcp_ca(const struct tcp_sock *tp) | ||
| 409 | { | ||
| 410 | return (void *) tp->ca_priv; | ||
| 411 | } | ||
| 412 | |||
| 442 | #endif | 413 | #endif |
| 443 | 414 | ||
| 444 | #endif /* _LINUX_TCP_H */ | 415 | #endif /* _LINUX_TCP_H */ |
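The ca_priv area added here is the private data space that the tcp.txt text above refers to. A hypothetical fragment (field and function names are illustrative only) showing the intended usage pattern:

/* Per-connection state kept directly in tp->ca_priv. */
#include <net/tcp.h>

struct example_state {
	u32	epoch_start;	/* when the current growth epoch began */
	u32	last_max_cwnd;	/* cwnd just before the last loss */
};

/* .init hook: tcp_ca() returns the preallocated ca_priv area, so no
 * allocation is needed as long as sizeof(struct example_state) does not
 * exceed TCP_CA_PRIV_SIZE; tcp_bic.c below checks exactly that with a
 * BUG_ON at module init.
 */
static void example_init(struct tcp_sock *tp)
{
	struct example_state *st = tcp_ca(tp);

	st->epoch_start = 0;
	st->last_max_cwnd = 0;
}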
diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h
index ceee962e1d15..7a5996743946 100644
--- a/include/linux/tcp_diag.h
+++ b/include/linux/tcp_diag.h
| @@ -99,9 +99,10 @@ enum | |||
| 99 | TCPDIAG_MEMINFO, | 99 | TCPDIAG_MEMINFO, |
| 100 | TCPDIAG_INFO, | 100 | TCPDIAG_INFO, |
| 101 | TCPDIAG_VEGASINFO, | 101 | TCPDIAG_VEGASINFO, |
| 102 | TCPDIAG_CONG, | ||
| 102 | }; | 103 | }; |
| 103 | 104 | ||
| 104 | #define TCPDIAG_MAX TCPDIAG_VEGASINFO | 105 | #define TCPDIAG_MAX TCPDIAG_CONG |
| 105 | 106 | ||
| 106 | 107 | ||
| 107 | /* TCPDIAG_MEM */ | 108 | /* TCPDIAG_MEM */ |
| @@ -123,5 +124,4 @@ struct tcpvegas_info { | |||
| 123 | __u32 tcpv_minrtt; | 124 | __u32 tcpv_minrtt; |
| 124 | }; | 125 | }; |
| 125 | 126 | ||
| 126 | |||
| 127 | #endif /* _TCP_DIAG_H_ */ | 127 | #endif /* _TCP_DIAG_H_ */ |
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f730935b824a..e427cf35915c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
| @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) | |||
| 505 | #else | 505 | #else |
| 506 | # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) | 506 | # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) |
| 507 | #endif | 507 | #endif |
| 508 | |||
| 509 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation | ||
| 510 | * max_cwnd = snd_cwnd * beta | ||
| 511 | */ | ||
| 512 | #define BICTCP_MAX_INCREMENT 32 /* | ||
| 513 | * Limit on the amount of | ||
| 514 | * increment allowed during | ||
| 515 | * binary search. | ||
| 516 | */ | ||
| 517 | #define BICTCP_FUNC_OF_MIN_INCR 11 /* | ||
| 518 | * log(B/Smin)/log(B/(B-1))+1, | ||
| 519 | * Smin:min increment | ||
| 520 | * B:log factor | ||
| 521 | */ | ||
| 522 | #define BICTCP_B 4 /* | ||
| 523 | * In binary search, | ||
| 524 | * go to point (max+min)/N | ||
| 525 | */ | ||
| 526 | |||
| 527 | /* | 508 | /* |
| 528 | * TCP option | 509 | * TCP option |
| 529 | */ | 510 | */ |
| @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; | |||
| 596 | extern int sysctl_tcp_tw_reuse; | 577 | extern int sysctl_tcp_tw_reuse; |
| 597 | extern int sysctl_tcp_frto; | 578 | extern int sysctl_tcp_frto; |
| 598 | extern int sysctl_tcp_low_latency; | 579 | extern int sysctl_tcp_low_latency; |
| 599 | extern int sysctl_tcp_westwood; | ||
| 600 | extern int sysctl_tcp_vegas_cong_avoid; | ||
| 601 | extern int sysctl_tcp_vegas_alpha; | ||
| 602 | extern int sysctl_tcp_vegas_beta; | ||
| 603 | extern int sysctl_tcp_vegas_gamma; | ||
| 604 | extern int sysctl_tcp_nometrics_save; | 580 | extern int sysctl_tcp_nometrics_save; |
| 605 | extern int sysctl_tcp_bic; | ||
| 606 | extern int sysctl_tcp_bic_fast_convergence; | ||
| 607 | extern int sysctl_tcp_bic_low_window; | ||
| 608 | extern int sysctl_tcp_bic_beta; | ||
| 609 | extern int sysctl_tcp_moderate_rcvbuf; | 581 | extern int sysctl_tcp_moderate_rcvbuf; |
| 610 | extern int sysctl_tcp_tso_win_divisor; | 582 | extern int sysctl_tcp_tso_win_divisor; |
| 611 | 583 | ||
| @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, | |||
| 1136 | tp->packets_out -= tcp_skb_pcount(skb); | 1108 | tp->packets_out -= tcp_skb_pcount(skb); |
| 1137 | } | 1109 | } |
| 1138 | 1110 | ||
| 1111 | /* Events passed to congestion control interface */ | ||
| 1112 | enum tcp_ca_event { | ||
| 1113 | CA_EVENT_TX_START, /* first transmit when no packets in flight */ | ||
| 1114 | CA_EVENT_CWND_RESTART, /* congestion window restart */ | ||
| 1115 | CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ | ||
| 1116 | CA_EVENT_FRTO, /* fast recovery timeout */ | ||
| 1117 | CA_EVENT_LOSS, /* loss timeout */ | ||
| 1118 | CA_EVENT_FAST_ACK, /* in sequence ack */ | ||
| 1119 | CA_EVENT_SLOW_ACK, /* other ack */ | ||
| 1120 | }; | ||
| 1121 | |||
| 1122 | /* | ||
| 1123 | * Interface for adding new TCP congestion control handlers | ||
| 1124 | */ | ||
| 1125 | #define TCP_CA_NAME_MAX 16 | ||
| 1126 | struct tcp_congestion_ops { | ||
| 1127 | struct list_head list; | ||
| 1128 | |||
| 1129 | /* initialize private data (optional) */ | ||
| 1130 | void (*init)(struct tcp_sock *tp); | ||
| 1131 | /* cleanup private data (optional) */ | ||
| 1132 | void (*release)(struct tcp_sock *tp); | ||
| 1133 | |||
| 1134 | /* return slow start threshold (required) */ | ||
| 1135 | u32 (*ssthresh)(struct tcp_sock *tp); | ||
| 1136 | /* lower bound for congestion window (optional) */ | ||
| 1137 | u32 (*min_cwnd)(struct tcp_sock *tp); | ||
| 1138 | /* do new cwnd calculation (required) */ | ||
| 1139 | void (*cong_avoid)(struct tcp_sock *tp, u32 ack, | ||
| 1140 | u32 rtt, u32 in_flight, int good_ack); | ||
| 1141 | /* round trip time sample per acked packet (optional) */ | ||
| 1142 | void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); | ||
| 1143 | /* call before changing ca_state (optional) */ | ||
| 1144 | void (*set_state)(struct tcp_sock *tp, u8 new_state); | ||
| 1145 | /* call when cwnd event occurs (optional) */ | ||
| 1146 | void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); | ||
| 1147 | /* new value of cwnd after loss (optional) */ | ||
| 1148 | u32 (*undo_cwnd)(struct tcp_sock *tp); | ||
| 1149 | /* hook for packet ack accounting (optional) */ | ||
| 1150 | void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); | ||
| 1151 | /* get info for tcp_diag (optional) */ | ||
| 1152 | void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); | ||
| 1153 | |||
| 1154 | char name[TCP_CA_NAME_MAX]; | ||
| 1155 | struct module *owner; | ||
| 1156 | }; | ||
| 1157 | |||
| 1158 | extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); | ||
| 1159 | extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); | ||
| 1160 | |||
| 1161 | extern void tcp_init_congestion_control(struct tcp_sock *tp); | ||
| 1162 | extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); | ||
| 1163 | extern int tcp_set_default_congestion_control(const char *name); | ||
| 1164 | extern void tcp_get_default_congestion_control(char *name); | ||
| 1165 | |||
| 1166 | extern struct tcp_congestion_ops tcp_reno; | ||
| 1167 | extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); | ||
| 1168 | extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
| 1169 | u32 rtt, u32 in_flight, int flag); | ||
| 1170 | extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); | ||
| 1171 | |||
| 1172 | static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) | ||
| 1173 | { | ||
| 1174 | if (tp->ca_ops->set_state) | ||
| 1175 | tp->ca_ops->set_state(tp, ca_state); | ||
| 1176 | tp->ca_state = ca_state; | ||
| 1177 | } | ||
| 1178 | |||
| 1179 | static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) | ||
| 1180 | { | ||
| 1181 | if (tp->ca_ops->cwnd_event) | ||
| 1182 | tp->ca_ops->cwnd_event(tp, event); | ||
| 1183 | } | ||
| 1184 | |||
| 1139 | /* This determines how many packets are "in the network" to the best | 1185 | /* This determines how many packets are "in the network" to the best |
| 1140 | * of our knowledge. In many cases it is conservative, but where | 1186 | * of our knowledge. In many cases it is conservative, but where |
| 1141 | * detailed information is available from the receiver (via SACK | 1187 | * detailed information is available from the receiver (via SACK |
| @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) | |||
| 1155 | return (tp->packets_out - tp->left_out + tp->retrans_out); | 1201 | return (tp->packets_out - tp->left_out + tp->retrans_out); |
| 1156 | } | 1202 | } |
| 1157 | 1203 | ||
| 1158 | /* | ||
| 1159 | * Which congestion algorithim is in use on the connection. | ||
| 1160 | */ | ||
| 1161 | #define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) | ||
| 1162 | #define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) | ||
| 1163 | #define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) | ||
| 1164 | |||
| 1165 | /* Recalculate snd_ssthresh, we want to set it to: | ||
| 1166 | * | ||
| 1167 | * Reno: | ||
| 1168 | * one half the current congestion window, but no | ||
| 1169 | * less than two segments | ||
| 1170 | * | ||
| 1171 | * BIC: | ||
| 1172 | * behave like Reno until low_window is reached, | ||
| 1173 | * then increase congestion window slowly | ||
| 1174 | */ | ||
| 1175 | static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) | ||
| 1176 | { | ||
| 1177 | if (tcp_is_bic(tp)) { | ||
| 1178 | if (sysctl_tcp_bic_fast_convergence && | ||
| 1179 | tp->snd_cwnd < tp->bictcp.last_max_cwnd) | ||
| 1180 | tp->bictcp.last_max_cwnd = (tp->snd_cwnd * | ||
| 1181 | (BICTCP_BETA_SCALE | ||
| 1182 | + sysctl_tcp_bic_beta)) | ||
| 1183 | / (2 * BICTCP_BETA_SCALE); | ||
| 1184 | else | ||
| 1185 | tp->bictcp.last_max_cwnd = tp->snd_cwnd; | ||
| 1186 | |||
| 1187 | if (tp->snd_cwnd > sysctl_tcp_bic_low_window) | ||
| 1188 | return max((tp->snd_cwnd * sysctl_tcp_bic_beta) | ||
| 1189 | / BICTCP_BETA_SCALE, 2U); | ||
| 1190 | } | ||
| 1191 | |||
| 1192 | return max(tp->snd_cwnd >> 1U, 2U); | ||
| 1193 | } | ||
| 1194 | |||
| 1195 | /* Stop taking Vegas samples for now. */ | ||
| 1196 | #define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) | ||
| 1197 | |||
| 1198 | static inline void tcp_vegas_enable(struct tcp_sock *tp) | ||
| 1199 | { | ||
| 1200 | /* There are several situations when we must "re-start" Vegas: | ||
| 1201 | * | ||
| 1202 | * o when a connection is established | ||
| 1203 | * o after an RTO | ||
| 1204 | * o after fast recovery | ||
| 1205 | * o when we send a packet and there is no outstanding | ||
| 1206 | * unacknowledged data (restarting an idle connection) | ||
| 1207 | * | ||
| 1208 | * In these circumstances we cannot do a Vegas calculation at the | ||
| 1209 | * end of the first RTT, because any calculation we do is using | ||
| 1210 | * stale info -- both the saved cwnd and congestion feedback are | ||
| 1211 | * stale. | ||
| 1212 | * | ||
| 1213 | * Instead we must wait until the completion of an RTT during | ||
| 1214 | * which we actually receive ACKs. | ||
| 1215 | */ | ||
| 1216 | |||
| 1217 | /* Begin taking Vegas samples next time we send something. */ | ||
| 1218 | tp->vegas.doing_vegas_now = 1; | ||
| 1219 | |||
| 1220 | /* Set the beginning of the next send window. */ | ||
| 1221 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
| 1222 | |||
| 1223 | tp->vegas.cntRTT = 0; | ||
| 1224 | tp->vegas.minRTT = 0x7fffffff; | ||
| 1225 | } | ||
| 1226 | |||
| 1227 | /* Should we be taking Vegas samples right now? */ | ||
| 1228 | #define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) | ||
| 1229 | |||
| 1230 | extern void tcp_ca_init(struct tcp_sock *tp); | ||
| 1231 | |||
| 1232 | static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) | ||
| 1233 | { | ||
| 1234 | if (tcp_is_vegas(tp)) { | ||
| 1235 | if (ca_state == TCP_CA_Open) | ||
| 1236 | tcp_vegas_enable(tp); | ||
| 1237 | else | ||
| 1238 | tcp_vegas_disable(tp); | ||
| 1239 | } | ||
| 1240 | tp->ca_state = ca_state; | ||
| 1241 | } | ||
| 1242 | |||
| 1243 | /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. | 1204 | /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. |
| 1244 | * The exception is rate halving phase, when cwnd is decreasing towards | 1205 | * The exception is rate halving phase, when cwnd is decreasing towards |
| 1245 | * ssthresh. | 1206 | * ssthresh. |
| @@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) | |||
| 1288 | static inline void __tcp_enter_cwr(struct tcp_sock *tp) | 1249 | static inline void __tcp_enter_cwr(struct tcp_sock *tp) |
| 1289 | { | 1250 | { |
| 1290 | tp->undo_marker = 0; | 1251 | tp->undo_marker = 0; |
| 1291 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1252 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1292 | tp->snd_cwnd = min(tp->snd_cwnd, | 1253 | tp->snd_cwnd = min(tp->snd_cwnd, |
| 1293 | tcp_packets_in_flight(tp) + 1U); | 1254 | tcp_packets_in_flight(tp) + 1U); |
| 1294 | tp->snd_cwnd_cnt = 0; | 1255 | tp->snd_cwnd_cnt = 0; |
| @@ -1876,52 +1837,4 @@ struct tcp_iter_state { | |||
| 1876 | extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); | 1837 | extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); |
| 1877 | extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); | 1838 | extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); |
| 1878 | 1839 | ||
| 1879 | /* TCP Westwood functions and constants */ | ||
| 1880 | |||
| 1881 | #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ | ||
| 1882 | #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ | ||
| 1883 | |||
| 1884 | static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) | ||
| 1885 | { | ||
| 1886 | if (tcp_is_westwood(tp)) | ||
| 1887 | tp->westwood.rtt = rtt_seq; | ||
| 1888 | } | ||
| 1889 | |||
| 1890 | static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) | ||
| 1891 | { | ||
| 1892 | return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / | ||
| 1893 | (__u32) (tp->mss_cache_std), | ||
| 1894 | 2U); | ||
| 1895 | } | ||
| 1896 | |||
| 1897 | static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) | ||
| 1898 | { | ||
| 1899 | return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; | ||
| 1900 | } | ||
| 1901 | |||
| 1902 | static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) | ||
| 1903 | { | ||
| 1904 | __u32 ssthresh = 0; | ||
| 1905 | |||
| 1906 | if (tcp_is_westwood(tp)) { | ||
| 1907 | ssthresh = __tcp_westwood_bw_rttmin(tp); | ||
| 1908 | if (ssthresh) | ||
| 1909 | tp->snd_ssthresh = ssthresh; | ||
| 1910 | } | ||
| 1911 | |||
| 1912 | return (ssthresh != 0); | ||
| 1913 | } | ||
| 1914 | |||
| 1915 | static inline int tcp_westwood_cwnd(struct tcp_sock *tp) | ||
| 1916 | { | ||
| 1917 | __u32 cwnd = 0; | ||
| 1918 | |||
| 1919 | if (tcp_is_westwood(tp)) { | ||
| 1920 | cwnd = __tcp_westwood_bw_rttmin(tp); | ||
| 1921 | if (cwnd) | ||
| 1922 | tp->snd_cwnd = cwnd; | ||
| 1923 | } | ||
| 1924 | |||
| 1925 | return (cwnd != 0); | ||
| 1926 | } | ||
| 1927 | #endif /* _TCP_H */ | 1840 | #endif /* _TCP_H */ |
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c349..690e88ba2484 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
| @@ -433,5 +433,95 @@ config IP_TCPDIAG | |||
| 433 | config IP_TCPDIAG_IPV6 | 433 | config IP_TCPDIAG_IPV6 |
| 434 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) | 434 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) |
| 435 | 435 | ||
| 436 | # TCP Reno is builtin (required as fallback) | ||
| 437 | menu "TCP congestion control" | ||
| 438 | depends on INET | ||
| 439 | |||
| 440 | config TCP_CONG_BIC | ||
| 441 | tristate "Binary Increase Congestion (BIC) control" | ||
| 442 | depends on INET | ||
| 443 | default y | ||
| 444 | ---help--- | ||
| 445 | BIC-TCP is a sender-side only change that ensures a linear RTT | ||
| 446 | fairness under large windows while offering both scalability and | ||
| 447 | bounded TCP-friendliness. The protocol combines two schemes | ||
| 448 | called additive increase and binary search increase. When the | ||
| 449 | congestion window is large, additive increase with a large | ||
| 450 | increment ensures linear RTT fairness as well as good | ||
| 451 | scalability. Under small congestion windows, binary search | ||
| 452 | increase provides TCP friendliness. | ||
| 453 | See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ | ||
| 454 | |||
| 455 | config TCP_CONG_WESTWOOD | ||
| 456 | tristate "TCP Westwood+" | ||
| 457 | depends on INET | ||
| 458 | default m | ||
| 459 | ---help--- | ||
| 460 | TCP Westwood+ is a sender-side only modification of the TCP Reno | ||
| 461 | protocol stack that optimizes the performance of TCP congestion | ||
| 462 | control. It is based on end-to-end bandwidth estimation to set | ||
| 463 | congestion window and slow start threshold after a congestion | ||
| 464 | episode. Using this estimation, TCP Westwood+ adaptively sets a | ||
| 465 | slow start threshold and a congestion window which takes into | ||
| 466 | account the bandwidth used at the time congestion is experienced. | ||
| 467 | TCP Westwood+ significantly increases fairness wrt TCP Reno in | ||
| 468 | wired networks and throughput over wireless links. | ||
| 469 | |||
| 470 | config TCP_CONG_HTCP | ||
| 471 | tristate "H-TCP" | ||
| 472 | depends on INET | ||
| 473 | default m | ||
| 474 | ---help--- | ||
| 475 | H-TCP is a sender-side only modification of the TCP Reno | ||
| 476 | protocol stack that optimizes the performance of TCP | ||
| 477 | congestion control for high speed network links. It uses a | ||
| 478 | mode switch to change the alpha and beta parameters of TCP Reno | ||
| 479 | based on network conditions, in a way that is fair with | ||
| 480 | other Reno and H-TCP flows. | ||
| 481 | |||
| 482 | config TCP_CONG_HSTCP | ||
| 483 | tristate "High Speed TCP" | ||
| 484 | depends on INET && EXPERIMENTAL | ||
| 485 | default n | ||
| 486 | ---help--- | ||
| 487 | Sally Floyd's High Speed TCP (RFC 3649) congestion control. | ||
| 488 | A modification to TCP's congestion control mechanism for use | ||
| 489 | with large congestion windows. A table indicates how much to | ||
| 490 | increase the congestion window by when an ACK is received. | ||
| 491 | For more detail see http://www.icir.org/floyd/hstcp.html | ||
| 492 | |||
| 493 | config TCP_CONG_HYBLA | ||
| 494 | tristate "TCP-Hybla congestion control algorithm" | ||
| 495 | depends on INET && EXPERIMENTAL | ||
| 496 | default n | ||
| 497 | ---help--- | ||
| 498 | TCP-Hybla is a sender-side only change that eliminates penalization of | ||
| 499 | long-RTT, large-bandwidth connections, like when satellite legs are | ||
| 500 | involved, especially when sharing a common bottleneck with normal | ||
| 501 | terrestrial connections. | ||
| 502 | |||
| 503 | config TCP_CONG_VEGAS | ||
| 504 | tristate "TCP Vegas" | ||
| 505 | depends on INET && EXPERIMENTAL | ||
| 506 | default n | ||
| 507 | ---help--- | ||
| 508 | TCP Vegas is a sender-side only change to TCP that anticipates | ||
| 509 | the onset of congestion by estimating the bandwidth. TCP Vegas | ||
| 510 | adjusts the sending rate by modifying the congestion | ||
| 511 | window. TCP Vegas should provide less packet loss, but it is | ||
| 512 | not as aggressive as TCP Reno. | ||
| 513 | |||
| 514 | config TCP_CONG_SCALABLE | ||
| 515 | tristate "Scalable TCP" | ||
| 516 | depends on INET && EXPERIMENTAL | ||
| 517 | default n | ||
| 518 | ---help--- | ||
| 519 | Scalable TCP is a sender-side only change to TCP which uses a | ||
| 520 | MIMD congestion control algorithm which has some nice scaling | ||
| 521 | properties, though it is known to have fairness issues. | ||
| 522 | See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ | ||
| 523 | |||
| 524 | endmenu | ||
| 525 | |||
| 436 | source "net/ipv4/ipvs/Kconfig" | 526 | source "net/ipv4/ipvs/Kconfig" |
| 437 | 527 | ||
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1add..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
| @@ -5,7 +5,8 @@ | |||
| 5 | obj-y := utils.o route.o inetpeer.o protocol.o \ | 5 | obj-y := utils.o route.o inetpeer.o protocol.o \ |
| 6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ | 6 | ip_input.o ip_fragment.o ip_forward.o ip_options.o \ |
| 7 | ip_output.o ip_sockglue.o \ | 7 | ip_output.o ip_sockglue.o \ |
| 8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ | 8 | tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ |
| 9 | tcp_minisocks.o tcp_cong.o \ | ||
| 9 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ | 10 | datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ |
| 10 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o | 11 | sysctl_net_ipv4.o fib_frontend.o fib_semantics.o |
| 11 | 12 | ||
| @@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/ | |||
| 30 | obj-$(CONFIG_IP_VS) += ipvs/ | 31 | obj-$(CONFIG_IP_VS) += ipvs/ |
| 31 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o | 32 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o |
| 32 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 33 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
| 34 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | ||
| 35 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | ||
| 36 | obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | ||
| 37 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | ||
| 38 | obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o | ||
| 39 | obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o | ||
| 40 | obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o | ||
| 33 | 41 | ||
| 34 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 42 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
| 35 | xfrm4_output.o | 43 | xfrm4_output.o |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0b..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
| @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, | |||
| 118 | return 1; | 118 | return 1; |
| 119 | } | 119 | } |
| 120 | 120 | ||
| 121 | static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, | ||
| 122 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 123 | { | ||
| 124 | char val[TCP_CA_NAME_MAX]; | ||
| 125 | ctl_table tbl = { | ||
| 126 | .data = val, | ||
| 127 | .maxlen = TCP_CA_NAME_MAX, | ||
| 128 | }; | ||
| 129 | int ret; | ||
| 130 | |||
| 131 | tcp_get_default_congestion_control(val); | ||
| 132 | |||
| 133 | ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); | ||
| 134 | if (write && ret == 0) | ||
| 135 | ret = tcp_set_default_congestion_control(val); | ||
| 136 | return ret; | ||
| 137 | } | ||
| 138 | |||
| 139 | int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, | ||
| 140 | void __user *oldval, size_t __user *oldlenp, | ||
| 141 | void __user *newval, size_t newlen, | ||
| 142 | void **context) | ||
| 143 | { | ||
| 144 | char val[TCP_CA_NAME_MAX]; | ||
| 145 | ctl_table tbl = { | ||
| 146 | .data = val, | ||
| 147 | .maxlen = TCP_CA_NAME_MAX, | ||
| 148 | }; | ||
| 149 | int ret; | ||
| 150 | |||
| 151 | tcp_get_default_congestion_control(val); | ||
| 152 | ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, | ||
| 153 | context); | ||
| 154 | if (ret == 0 && newval && newlen) | ||
| 155 | ret = tcp_set_default_congestion_control(val); | ||
| 156 | return ret; | ||
| 157 | } | ||
| 158 | |||
| 159 | |||
| 121 | ctl_table ipv4_table[] = { | 160 | ctl_table ipv4_table[] = { |
| 122 | { | 161 | { |
| 123 | .ctl_name = NET_IPV4_TCP_TIMESTAMPS, | 162 | .ctl_name = NET_IPV4_TCP_TIMESTAMPS, |
| @@ -612,70 +651,6 @@ ctl_table ipv4_table[] = { | |||
| 612 | .proc_handler = &proc_dointvec, | 651 | .proc_handler = &proc_dointvec, |
| 613 | }, | 652 | }, |
| 614 | { | 653 | { |
| 615 | .ctl_name = NET_TCP_WESTWOOD, | ||
| 616 | .procname = "tcp_westwood", | ||
| 617 | .data = &sysctl_tcp_westwood, | ||
| 618 | .maxlen = sizeof(int), | ||
| 619 | .mode = 0644, | ||
| 620 | .proc_handler = &proc_dointvec, | ||
| 621 | }, | ||
| 622 | { | ||
| 623 | .ctl_name = NET_TCP_VEGAS, | ||
| 624 | .procname = "tcp_vegas_cong_avoid", | ||
| 625 | .data = &sysctl_tcp_vegas_cong_avoid, | ||
| 626 | .maxlen = sizeof(int), | ||
| 627 | .mode = 0644, | ||
| 628 | .proc_handler = &proc_dointvec, | ||
| 629 | }, | ||
| 630 | { | ||
| 631 | .ctl_name = NET_TCP_VEGAS_ALPHA, | ||
| 632 | .procname = "tcp_vegas_alpha", | ||
| 633 | .data = &sysctl_tcp_vegas_alpha, | ||
| 634 | .maxlen = sizeof(int), | ||
| 635 | .mode = 0644, | ||
| 636 | .proc_handler = &proc_dointvec, | ||
| 637 | }, | ||
| 638 | { | ||
| 639 | .ctl_name = NET_TCP_VEGAS_BETA, | ||
| 640 | .procname = "tcp_vegas_beta", | ||
| 641 | .data = &sysctl_tcp_vegas_beta, | ||
| 642 | .maxlen = sizeof(int), | ||
| 643 | .mode = 0644, | ||
| 644 | .proc_handler = &proc_dointvec, | ||
| 645 | }, | ||
| 646 | { | ||
| 647 | .ctl_name = NET_TCP_VEGAS_GAMMA, | ||
| 648 | .procname = "tcp_vegas_gamma", | ||
| 649 | .data = &sysctl_tcp_vegas_gamma, | ||
| 650 | .maxlen = sizeof(int), | ||
| 651 | .mode = 0644, | ||
| 652 | .proc_handler = &proc_dointvec, | ||
| 653 | }, | ||
| 654 | { | ||
| 655 | .ctl_name = NET_TCP_BIC, | ||
| 656 | .procname = "tcp_bic", | ||
| 657 | .data = &sysctl_tcp_bic, | ||
| 658 | .maxlen = sizeof(int), | ||
| 659 | .mode = 0644, | ||
| 660 | .proc_handler = &proc_dointvec, | ||
| 661 | }, | ||
| 662 | { | ||
| 663 | .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, | ||
| 664 | .procname = "tcp_bic_fast_convergence", | ||
| 665 | .data = &sysctl_tcp_bic_fast_convergence, | ||
| 666 | .maxlen = sizeof(int), | ||
| 667 | .mode = 0644, | ||
| 668 | .proc_handler = &proc_dointvec, | ||
| 669 | }, | ||
| 670 | { | ||
| 671 | .ctl_name = NET_TCP_BIC_LOW_WINDOW, | ||
| 672 | .procname = "tcp_bic_low_window", | ||
| 673 | .data = &sysctl_tcp_bic_low_window, | ||
| 674 | .maxlen = sizeof(int), | ||
| 675 | .mode = 0644, | ||
| 676 | .proc_handler = &proc_dointvec, | ||
| 677 | }, | ||
| 678 | { | ||
| 679 | .ctl_name = NET_TCP_MODERATE_RCVBUF, | 654 | .ctl_name = NET_TCP_MODERATE_RCVBUF, |
| 680 | .procname = "tcp_moderate_rcvbuf", | 655 | .procname = "tcp_moderate_rcvbuf", |
| 681 | .data = &sysctl_tcp_moderate_rcvbuf, | 656 | .data = &sysctl_tcp_moderate_rcvbuf, |
| @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { | |||
| 692 | .proc_handler = &proc_dointvec, | 667 | .proc_handler = &proc_dointvec, |
| 693 | }, | 668 | }, |
| 694 | { | 669 | { |
| 695 | .ctl_name = NET_TCP_BIC_BETA, | 670 | .ctl_name = NET_TCP_CONG_CONTROL, |
| 696 | .procname = "tcp_bic_beta", | 671 | .procname = "tcp_congestion_control", |
| 697 | .data = &sysctl_tcp_bic_beta, | ||
| 698 | .maxlen = sizeof(int), | ||
| 699 | .mode = 0644, | 672 | .mode = 0644, |
| 700 | .proc_handler = &proc_dointvec, | 673 | .maxlen = TCP_CA_NAME_MAX, |
| 674 | .proc_handler = &proc_tcp_congestion_control, | ||
| 675 | .strategy = &sysctl_tcp_congestion_control, | ||
| 701 | }, | 676 | }, |
| 677 | |||
| 702 | { .ctl_name = 0 } | 678 | { .ctl_name = 0 } |
| 703 | }; | 679 | }; |
| 704 | 680 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd36..f3dbc8dc1263 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
| @@ -2333,6 +2333,8 @@ void __init tcp_init(void) | |||
| 2333 | printk(KERN_INFO "TCP: Hash tables configured " | 2333 | printk(KERN_INFO "TCP: Hash tables configured " |
| 2334 | "(established %d bind %d)\n", | 2334 | "(established %d bind %d)\n", |
| 2335 | tcp_ehash_size << 1, tcp_bhash_size); | 2335 | tcp_ehash_size << 1, tcp_bhash_size); |
| 2336 | |||
| 2337 | tcp_register_congestion_control(&tcp_reno); | ||
| 2336 | } | 2338 | } |
| 2337 | 2339 | ||
| 2338 | EXPORT_SYMBOL(tcp_accept); | 2340 | EXPORT_SYMBOL(tcp_accept); |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000000..ec38d45d6649
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
| @@ -0,0 +1,331 @@ | |||
| 1 | /* | ||
| 2 | * Binary Increase Congestion control for TCP | ||
| 3 | * | ||
| 4 | * This is from the implementation of BICTCP in | ||
| 5 | * Lisong Xu, Khaled Harfoush, and Injong Rhee. | ||
| 6 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
| 7 | * Networks" in INFOCOM 2004 | ||
| 8 | * Available from: | ||
| 9 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
| 10 | * | ||
| 11 | * Unless BIC is enabled and congestion window is large | ||
| 12 | * this behaves the same as the original Reno. | ||
| 13 | */ | ||
| 14 | |||
| 15 | #include <linux/config.h> | ||
| 16 | #include <linux/mm.h> | ||
| 17 | #include <linux/module.h> | ||
| 18 | #include <net/tcp.h> | ||
| 19 | |||
| 20 | |||
| 21 | #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation | ||
| 22 | * max_cwnd = snd_cwnd * beta | ||
| 23 | */ | ||
| 24 | #define BICTCP_B 4 /* | ||
| 25 | * In binary search, | ||
| 26 | * go to point (max+min)/N | ||
| 27 | */ | ||
| 28 | |||
| 29 | static int fast_convergence = 1; | ||
| 30 | static int max_increment = 32; | ||
| 31 | static int low_window = 14; | ||
| 32 | static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
| 33 | static int low_utilization_threshold = 153; | ||
| 34 | static int low_utilization_period = 2; | ||
| 35 | static int initial_ssthresh = 100; | ||
| 36 | static int smooth_part = 20; | ||
| 37 | |||
| 38 | module_param(fast_convergence, int, 0644); | ||
| 39 | MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); | ||
| 40 | module_param(max_increment, int, 0644); | ||
| 41 | MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); | ||
| 42 | module_param(low_window, int, 0644); | ||
| 43 | MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); | ||
| 44 | module_param(beta, int, 0644); | ||
| 45 | MODULE_PARM_DESC(beta, "beta for multiplicative increase"); | ||
| 46 | module_param(low_utilization_threshold, int, 0644); | ||
| 47 | MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); | ||
| 48 | module_param(low_utilization_period, int, 0644); | ||
| 49 | MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); | ||
| 50 | module_param(initial_ssthresh, int, 0644); | ||
| 51 | MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); | ||
| 52 | module_param(smooth_part, int, 0644); | ||
| 53 | MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); | ||
| 54 | |||
| 55 | |||
| 56 | /* BIC TCP Parameters */ | ||
| 57 | struct bictcp { | ||
| 58 | u32 cnt; /* increase cwnd by 1 after this many ACKs */ | ||
| 59 | u32 last_max_cwnd; /* last maximum snd_cwnd */ | ||
| 60 | u32 loss_cwnd; /* congestion window at last loss */ | ||
| 61 | u32 last_cwnd; /* the last snd_cwnd */ | ||
| 62 | u32 last_time; /* time when updated last_cwnd */ | ||
| 63 | u32 delay_min; /* min delay */ | ||
| 64 | u32 delay_max; /* max delay */ | ||
| 65 | u32 last_delay; | ||
| 66 | u8 low_utilization;/* 0: high; 1: low */ | ||
| 67 | u32 low_utilization_start; /* starting time of low utilization detection*/ | ||
| 68 | u32 epoch_start; /* beginning of an epoch */ | ||
| 69 | #define ACK_RATIO_SHIFT 4 | ||
| 70 | u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | ||
| 71 | }; | ||
| 72 | |||
| 73 | static inline void bictcp_reset(struct bictcp *ca) | ||
| 74 | { | ||
| 75 | ca->cnt = 0; | ||
| 76 | ca->last_max_cwnd = 0; | ||
| 77 | ca->loss_cwnd = 0; | ||
| 78 | ca->last_cwnd = 0; | ||
| 79 | ca->last_time = 0; | ||
| 80 | ca->delay_min = 0; | ||
| 81 | ca->delay_max = 0; | ||
| 82 | ca->last_delay = 0; | ||
| 83 | ca->low_utilization = 0; | ||
| 84 | ca->low_utilization_start = 0; | ||
| 85 | ca->epoch_start = 0; | ||
| 86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | ||
| 87 | } | ||
| 88 | |||
| 89 | static void bictcp_init(struct tcp_sock *tp) | ||
| 90 | { | ||
| 91 | bictcp_reset(tcp_ca(tp)); | ||
| 92 | if (initial_ssthresh) | ||
| 93 | tp->snd_ssthresh = initial_ssthresh; | ||
| 94 | } | ||
| 95 | |||
| 96 | /* | ||
| 97 | * Compute congestion window to use. | ||
| 98 | */ | ||
| 99 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | ||
| 100 | { | ||
| 101 | if (ca->last_cwnd == cwnd && | ||
| 102 | (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) | ||
| 103 | return; | ||
| 104 | |||
| 105 | ca->last_cwnd = cwnd; | ||
| 106 | ca->last_time = tcp_time_stamp; | ||
| 107 | |||
| 108 | if (ca->epoch_start == 0) /* record the beginning of an epoch */ | ||
| 109 | ca->epoch_start = tcp_time_stamp; | ||
| 110 | |||
| 111 | /* start off normal */ | ||
| 112 | if (cwnd <= low_window) { | ||
| 113 | ca->cnt = cwnd; | ||
| 114 | return; | ||
| 115 | } | ||
| 116 | |||
| 117 | /* binary increase */ | ||
| 118 | if (cwnd < ca->last_max_cwnd) { | ||
| 119 | __u32 dist = (ca->last_max_cwnd - cwnd) | ||
| 120 | / BICTCP_B; | ||
| 121 | |||
| 122 | if (dist > max_increment) | ||
| 123 | /* linear increase */ | ||
| 124 | ca->cnt = cwnd / max_increment; | ||
| 125 | else if (dist <= 1U) | ||
| 126 | /* binary search increase */ | ||
| 127 | ca->cnt = (cwnd * smooth_part) / BICTCP_B; | ||
| 128 | else | ||
| 129 | /* binary search increase */ | ||
| 130 | ca->cnt = cwnd / dist; | ||
| 131 | } else { | ||
| 132 | /* slow start and linear increase */ | ||
| 133 | if (cwnd < ca->last_max_cwnd + BICTCP_B) | ||
| 134 | /* slow start */ | ||
| 135 | ca->cnt = (cwnd * smooth_part) / BICTCP_B; | ||
| 136 | else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) | ||
| 137 | /* slow start */ | ||
| 138 | ca->cnt = (cwnd * (BICTCP_B-1)) | ||
| 139 | / (cwnd - ca->last_max_cwnd); | ||
| 140 | else | ||
| 141 | /* linear increase */ | ||
| 142 | ca->cnt = cwnd / max_increment; | ||
| 143 | } | ||
| 144 | |||
| 145 | /* if in slow start or link utilization is very low */ | ||
| 146 | if ( ca->loss_cwnd == 0 || | ||
| 147 | (cwnd > ca->loss_cwnd && ca->low_utilization)) { | ||
| 148 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ | ||
| 149 | ca->cnt = 20; | ||
| 150 | } | ||
| 151 | |||
| 152 | ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; | ||
| 153 | if (ca->cnt == 0) /* cannot be zero */ | ||
| 154 | ca->cnt = 1; | ||
| 155 | } | ||
| 156 | |||
| 157 | |||
| 158 | /* Detect low utilization in congestion avoidance */ | ||
| 159 | static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | ||
| 160 | { | ||
| 161 | struct bictcp *ca = tcp_ca(tp); | ||
| 162 | u32 dist, delay; | ||
| 163 | |||
| 164 | /* No time stamp */ | ||
| 165 | if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || | ||
| 166 | /* Discard delay samples right after fast recovery */ | ||
| 167 | tcp_time_stamp < ca->epoch_start + HZ || | ||
| 168 | /* these delay samples may not be accurate */ | ||
| 169 | flag == 0) { | ||
| 170 | ca->last_delay = 0; | ||
| 171 | goto notlow; | ||
| 172 | } | ||
| 173 | |||
| 174 | delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ | ||
| 175 | ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | ||
| 176 | if (delay == 0) /* no previous delay sample */ | ||
| 177 | goto notlow; | ||
| 178 | |||
| 179 | /* first time call or link delay decreases */ | ||
| 180 | if (ca->delay_min == 0 || ca->delay_min > delay) { | ||
| 181 | ca->delay_min = ca->delay_max = delay; | ||
| 182 | goto notlow; | ||
| 183 | } | ||
| 184 | |||
| 185 | if (ca->delay_max < delay) | ||
| 186 | ca->delay_max = delay; | ||
| 187 | |||
| 188 | /* utilization is low, if avg delay < dist*threshold | ||
| 189 | for low_utilization_period time */ | ||
| 190 | dist = ca->delay_max - ca->delay_min; | ||
| 191 | if (dist <= ca->delay_min>>6 || | ||
| 192 | tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) | ||
| 193 | goto notlow; | ||
| 194 | |||
| 195 | if (ca->low_utilization_start == 0) { | ||
| 196 | ca->low_utilization = 0; | ||
| 197 | ca->low_utilization_start = tcp_time_stamp; | ||
| 198 | } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) | ||
| 199 | > low_utilization_period*HZ) { | ||
| 200 | ca->low_utilization = 1; | ||
| 201 | } | ||
| 202 | |||
| 203 | return; | ||
| 204 | |||
| 205 | notlow: | ||
| 206 | ca->low_utilization = 0; | ||
| 207 | ca->low_utilization_start = 0; | ||
| 208 | |||
| 209 | } | ||
| 210 | |||
| 211 | static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
| 212 | u32 seq_rtt, u32 in_flight, int data_acked) | ||
| 213 | { | ||
| 214 | struct bictcp *ca = tcp_ca(tp); | ||
| 215 | |||
| 216 | bictcp_low_utilization(tp, data_acked); | ||
| 217 | |||
| 218 | if (in_flight < tp->snd_cwnd) | ||
| 219 | return; | ||
| 220 | |||
| 221 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
| 222 | /* In "safe" area, increase. */ | ||
| 223 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 224 | tp->snd_cwnd++; | ||
| 225 | } else { | ||
| 226 | bictcp_update(ca, tp->snd_cwnd); | ||
| 227 | |||
| 228 | /* In dangerous area, increase slowly. | ||
| 229 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
| 230 | */ | ||
| 231 | if (tp->snd_cwnd_cnt >= ca->cnt) { | ||
| 232 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 233 | tp->snd_cwnd++; | ||
| 234 | tp->snd_cwnd_cnt = 0; | ||
| 235 | } else | ||
| 236 | tp->snd_cwnd_cnt++; | ||
| 237 | } | ||
| 238 | |||
| 239 | } | ||
| 240 | |||
| 241 | /* | ||
| 242 | * behave like Reno until low_window is reached, | ||
| 243 | * then increase congestion window slowly | ||
| 244 | */ | ||
| 245 | static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | ||
| 246 | { | ||
| 247 | struct bictcp *ca = tcp_ca(tp); | ||
| 248 | |||
| 249 | ca->epoch_start = 0; /* end of epoch */ | ||
| 250 | |||
| 251 | /* in case of wrong delay_max*/ | ||
| 252 | if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) | ||
| 253 | ca->delay_max = ca->delay_min | ||
| 254 | + ((ca->delay_max - ca->delay_min)* 90) / 100; | ||
| 255 | |||
| 256 | /* Wmax and fast convergence */ | ||
| 257 | if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) | ||
| 258 | ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) | ||
| 259 | / (2 * BICTCP_BETA_SCALE); | ||
| 260 | else | ||
| 261 | ca->last_max_cwnd = tp->snd_cwnd; | ||
| 262 | |||
| 263 | ca->loss_cwnd = tp->snd_cwnd; | ||
| 264 | |||
| 265 | |||
| 266 | if (tp->snd_cwnd <= low_window) | ||
| 267 | return max(tp->snd_cwnd >> 1U, 2U); | ||
| 268 | else | ||
| 269 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); | ||
| 270 | } | ||
| 271 | |||
| 272 | static u32 bictcp_undo_cwnd(struct tcp_sock *tp) | ||
| 273 | { | ||
| 274 | struct bictcp *ca = tcp_ca(tp); | ||
| 275 | |||
| 276 | return max(tp->snd_cwnd, ca->last_max_cwnd); | ||
| 277 | } | ||
| 278 | |||
| 279 | static u32 bictcp_min_cwnd(struct tcp_sock *tp) | ||
| 280 | { | ||
| 281 | return tp->snd_ssthresh; | ||
| 282 | } | ||
| 283 | |||
| 284 | static void bictcp_state(struct tcp_sock *tp, u8 new_state) | ||
| 285 | { | ||
| 286 | if (new_state == TCP_CA_Loss) | ||
| 287 | bictcp_reset(tcp_ca(tp)); | ||
| 288 | } | ||
| 289 | |||
| 290 | /* Track delayed acknowledgement ratio using sliding window | ||
| 291 | * ratio = (15*ratio + sample) / 16 | ||
| 292 | */ | ||
| 293 | static void bictcp_acked(struct tcp_sock *tp, u32 cnt) | ||
| 294 | { | ||
| 295 | if (cnt > 0 && tp->ca_state == TCP_CA_Open) { | ||
| 296 | struct bictcp *ca = tcp_ca(tp); | ||
| 297 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | ||
| 298 | ca->delayed_ack += cnt; | ||
| 299 | } | ||
| 300 | } | ||
| 301 | |||
| 302 | |||
| 303 | static struct tcp_congestion_ops bictcp = { | ||
| 304 | .init = bictcp_init, | ||
| 305 | .ssthresh = bictcp_recalc_ssthresh, | ||
| 306 | .cong_avoid = bictcp_cong_avoid, | ||
| 307 | .set_state = bictcp_state, | ||
| 308 | .undo_cwnd = bictcp_undo_cwnd, | ||
| 309 | .min_cwnd = bictcp_min_cwnd, | ||
| 310 | .pkts_acked = bictcp_acked, | ||
| 311 | .owner = THIS_MODULE, | ||
| 312 | .name = "bic", | ||
| 313 | }; | ||
| 314 | |||
| 315 | static int __init bictcp_register(void) | ||
| 316 | { | ||
| 317 | BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); | ||
| 318 | return tcp_register_congestion_control(&bictcp); | ||
| 319 | } | ||
| 320 | |||
| 321 | static void __exit bictcp_unregister(void) | ||
| 322 | { | ||
| 323 | tcp_unregister_congestion_control(&bictcp); | ||
| 324 | } | ||
| 325 | |||
| 326 | module_init(bictcp_register); | ||
| 327 | module_exit(bictcp_unregister); | ||
| 328 | |||
| 329 | MODULE_AUTHOR("Stephen Hemminger"); | ||
| 330 | MODULE_LICENSE("GPL"); | ||
| 331 | MODULE_DESCRIPTION("BIC TCP"); | ||
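Since every tunable above is declared with module_param(..., 0644), the old BIC sysctls now map onto module parameters: for example, "modprobe tcp_bic fast_convergence=0 low_window=20" roughly corresponds to the removed tcp_bic_fast_convergence and tcp_bic_low_window sysctls, and on kernels that expose sysfs module parameters the same values can be changed at runtime under /sys/module/tcp_bic/parameters/.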
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..665394a63ae4
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
| @@ -0,0 +1,195 @@ | |||
| 1 | /* | ||
| 2 | * Pluggable TCP congestion control support and newReno | ||
| 3 | * congestion control. | ||
| 4 | * Based on ideas from I/O scheduler support and Web100. | ||
| 5 | * | ||
| 6 | * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/config.h> | ||
| 10 | #include <linux/module.h> | ||
| 11 | #include <linux/mm.h> | ||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/list.h> | ||
| 14 | #include <net/tcp.h> | ||
| 15 | |||
| 16 | static DEFINE_SPINLOCK(tcp_cong_list_lock); | ||
| 17 | static LIST_HEAD(tcp_cong_list); | ||
| 18 | |||
| 19 | /* Simple linear search, don't expect many entries! */ | ||
| 20 | static struct tcp_congestion_ops *tcp_ca_find(const char *name) | ||
| 21 | { | ||
| 22 | struct tcp_congestion_ops *e; | ||
| 23 | |||
| 24 | list_for_each_entry(e, &tcp_cong_list, list) { | ||
| 25 | if (strcmp(e->name, name) == 0) | ||
| 26 | return e; | ||
| 27 | } | ||
| 28 | |||
| 29 | return NULL; | ||
| 30 | } | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Attach new congestion control algorithm to the list | ||
| 34 | * of available options. | ||
| 35 | */ | ||
| 36 | int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | ||
| 37 | { | ||
| 38 | int ret = 0; | ||
| 39 | |||
| 40 | /* all algorithms must implement ssthresh, cong_avoid and min_cwnd ops */ | ||
| 41 | if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { | ||
| 42 | printk(KERN_ERR "TCP %s does not implement required ops\n", | ||
| 43 | ca->name); | ||
| 44 | return -EINVAL; | ||
| 45 | } | ||
| 46 | |||
| 47 | spin_lock(&tcp_cong_list_lock); | ||
| 48 | if (tcp_ca_find(ca->name)) { | ||
| 49 | printk(KERN_NOTICE "TCP %s already registered\n", ca->name); | ||
| 50 | ret = -EEXIST; | ||
| 51 | } else { | ||
| 52 | list_add_rcu(&ca->list, &tcp_cong_list); | ||
| 53 | printk(KERN_INFO "TCP %s registered\n", ca->name); | ||
| 54 | } | ||
| 55 | spin_unlock(&tcp_cong_list_lock); | ||
| 56 | |||
| 57 | return ret; | ||
| 58 | } | ||
| 59 | EXPORT_SYMBOL_GPL(tcp_register_congestion_control); | ||
| 60 | |||
| 61 | /* | ||
| 62 | * Remove congestion control algorithm, called from | ||
| 63 | * the module's remove function. Module ref counts are used | ||
| 64 | * to ensure that this can't be done till all sockets using | ||
| 65 | * that method are closed. | ||
| 66 | */ | ||
| 67 | void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) | ||
| 68 | { | ||
| 69 | spin_lock(&tcp_cong_list_lock); | ||
| 70 | list_del_rcu(&ca->list); | ||
| 71 | spin_unlock(&tcp_cong_list_lock); | ||
| 72 | } | ||
| 73 | EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); | ||
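Registration fails unless ssthresh, cong_avoid and min_cwnd are all supplied (see the check above), and unregistration is meant to run from the module exit path while module refcounts keep sockets that still use the algorithm safe. A minimal, hypothetical module built on this API could look like the sketch below; the "simple" name is made up, and the ops simply reuse the Reno helpers exported further down this file:

/* Hypothetical out-of-tree user of the registration API above; the
 * "simple" name is illustrative only.  It supplies the three mandatory
 * ops by borrowing the exported Reno helpers. */
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_simple = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
	.owner		= THIS_MODULE,
	.name		= "simple",
};

static int __init simple_register(void)
{
	return tcp_register_congestion_control(&tcp_simple);
}

static void __exit simple_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_simple);
}

module_init(simple_register);
module_exit(simple_unregister);
MODULE_LICENSE("GPL");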
| 74 | |||
| 75 | /* Assign choice of congestion control. */ | ||
| 76 | void tcp_init_congestion_control(struct tcp_sock *tp) | ||
| 77 | { | ||
| 78 | struct tcp_congestion_ops *ca; | ||
| 79 | |||
| 80 | rcu_read_lock(); | ||
| 81 | list_for_each_entry_rcu(ca, &tcp_cong_list, list) { | ||
| 82 | if (try_module_get(ca->owner)) { | ||
| 83 | tp->ca_ops = ca; | ||
| 84 | break; | ||
| 85 | } | ||
| 86 | |||
| 87 | } | ||
| 88 | rcu_read_unlock(); | ||
| 89 | |||
| 90 | if (tp->ca_ops->init) | ||
| 91 | tp->ca_ops->init(tp); | ||
| 92 | } | ||
| 93 | |||
| 94 | /* Manage refcounts on socket close. */ | ||
| 95 | void tcp_cleanup_congestion_control(struct tcp_sock *tp) | ||
| 96 | { | ||
| 97 | if (tp->ca_ops->release) | ||
| 98 | tp->ca_ops->release(tp); | ||
| 99 | module_put(tp->ca_ops->owner); | ||
| 100 | } | ||
| 101 | |||
| 102 | /* Used by sysctl to change default congestion control */ | ||
| 103 | int tcp_set_default_congestion_control(const char *name) | ||
| 104 | { | ||
| 105 | struct tcp_congestion_ops *ca; | ||
| 106 | int ret = -ENOENT; | ||
| 107 | |||
| 108 | spin_lock(&tcp_cong_list_lock); | ||
| 109 | ca = tcp_ca_find(name); | ||
| 110 | #ifdef CONFIG_KMOD | ||
| 111 | if (!ca) { | ||
| 112 | spin_unlock(&tcp_cong_list_lock); | ||
| 113 | |||
| 114 | request_module("tcp_%s", name); | ||
| 115 | spin_lock(&tcp_cong_list_lock); | ||
| 116 | ca = tcp_ca_find(name); | ||
| 117 | } | ||
| 118 | #endif | ||
| 119 | |||
| 120 | if (ca) { | ||
| 121 | list_move(&ca->list, &tcp_cong_list); | ||
| 122 | ret = 0; | ||
| 123 | } | ||
| 124 | spin_unlock(&tcp_cong_list_lock); | ||
| 125 | |||
| 126 | return ret; | ||
| 127 | } | ||
| 128 | |||
| 129 | /* Get current default congestion control */ | ||
| 130 | void tcp_get_default_congestion_control(char *name) | ||
| 131 | { | ||
| 132 | struct tcp_congestion_ops *ca; | ||
| 133 | /* We will always have reno... */ | ||
| 134 | BUG_ON(list_empty(&tcp_cong_list)); | ||
| 135 | |||
| 136 | rcu_read_lock(); | ||
| 137 | ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); | ||
| 138 | strncpy(name, ca->name, TCP_CA_NAME_MAX); | ||
| 139 | rcu_read_unlock(); | ||
| 140 | } | ||
| 141 | |||
| 142 | /* | ||
| 143 | * TCP Reno congestion control | ||
| 144 | * This is a special case, also used as the fallback. | ||
| 145 | */ | ||
| 146 | /* This is Jacobson's slow start and congestion avoidance. | ||
| 147 | * SIGCOMM '88, p. 328. | ||
| 148 | */ | ||
| 149 | void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, | ||
| 150 | int flag) | ||
| 151 | { | ||
| 152 | if (in_flight < tp->snd_cwnd) | ||
| 153 | return; | ||
| 154 | |||
| 155 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
| 156 | /* In "safe" area, increase. */ | ||
| 157 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 158 | tp->snd_cwnd++; | ||
| 159 | } else { | ||
| 160 | /* In dangerous area, increase slowly. | ||
| 161 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
| 162 | */ | ||
| 163 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
| 164 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 165 | tp->snd_cwnd++; | ||
| 166 | tp->snd_cwnd_cnt = 0; | ||
| 167 | } else | ||
| 168 | tp->snd_cwnd_cnt++; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); | ||
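The snd_cwnd_cnt bookkeeping above is what turns "cwnd += 1/cwnd per ACK" into integer arithmetic: one extra segment is added only after a full window's worth of ACKs, i.e. roughly once per RTT. A standalone sketch with made-up numbers:

#include <stdio.h>

/* Sketch of the Reno congestion-avoidance accounting above:
 * snd_cwnd_cnt counts ACKed segments and cwnd grows by one full
 * segment only once a whole window has been acknowledged. */
int main(void)
{
	unsigned int snd_cwnd = 10, snd_cwnd_cnt = 0, acks;

	for (acks = 0; acks < 30; acks++) {	/* ~3 RTTs worth of ACKs */
		if (snd_cwnd_cnt >= snd_cwnd) {
			snd_cwnd++;		/* +1 segment per RTT */
			snd_cwnd_cnt = 0;
		} else
			snd_cwnd_cnt++;
	}
	printf("cwnd after 30 acks: %u\n", snd_cwnd);	/* 10 -> 12 */
	return 0;
}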
| 172 | |||
| 173 | /* Slow start threshold is half the congestion window (min 2) */ | ||
| 174 | u32 tcp_reno_ssthresh(struct tcp_sock *tp) | ||
| 175 | { | ||
| 176 | return max(tp->snd_cwnd >> 1U, 2U); | ||
| 177 | } | ||
| 178 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | ||
| 179 | |||
| 180 | /* Lower bound on congestion window. */ | ||
| 181 | u32 tcp_reno_min_cwnd(struct tcp_sock *tp) | ||
| 182 | { | ||
| 183 | return tp->snd_ssthresh/2; | ||
| 184 | } | ||
| 185 | EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); | ||
| 186 | |||
| 187 | struct tcp_congestion_ops tcp_reno = { | ||
| 188 | .name = "reno", | ||
| 189 | .owner = THIS_MODULE, | ||
| 190 | .ssthresh = tcp_reno_ssthresh, | ||
| 191 | .cong_avoid = tcp_reno_cong_avoid, | ||
| 192 | .min_cwnd = tcp_reno_min_cwnd, | ||
| 193 | }; | ||
| 194 | |||
| 195 | EXPORT_SYMBOL_GPL(tcp_reno); | ||
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc07921..f66945cb158f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c | |||
| @@ -42,15 +42,8 @@ struct tcpdiag_entry | |||
| 42 | 42 | ||
| 43 | static struct sock *tcpnl; | 43 | static struct sock *tcpnl; |
| 44 | 44 | ||
| 45 | |||
| 46 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ | 45 | #define TCPDIAG_PUT(skb, attrtype, attrlen) \ |
| 47 | ({ int rtalen = RTA_LENGTH(attrlen); \ | 46 | RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) |
| 48 | struct rtattr *rta; \ | ||
| 49 | if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ | ||
| 50 | rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ | ||
| 51 | rta->rta_type = attrtype; \ | ||
| 52 | rta->rta_len = rtalen; \ | ||
| 53 | RTA_DATA(rta); }) | ||
| 54 | 47 | ||
| 55 | static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | 48 | static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, |
| 56 | int ext, u32 pid, u32 seq, u16 nlmsg_flags) | 49 | int ext, u32 pid, u32 seq, u16 nlmsg_flags) |
| @@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
| 61 | struct nlmsghdr *nlh; | 54 | struct nlmsghdr *nlh; |
| 62 | struct tcp_info *info = NULL; | 55 | struct tcp_info *info = NULL; |
| 63 | struct tcpdiag_meminfo *minfo = NULL; | 56 | struct tcpdiag_meminfo *minfo = NULL; |
| 64 | struct tcpvegas_info *vinfo = NULL; | ||
| 65 | unsigned char *b = skb->tail; | 57 | unsigned char *b = skb->tail; |
| 66 | 58 | ||
| 67 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); | 59 | nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); |
| @@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
| 73 | if (ext & (1<<(TCPDIAG_INFO-1))) | 65 | if (ext & (1<<(TCPDIAG_INFO-1))) |
| 74 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); | 66 | info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); |
| 75 | 67 | ||
| 76 | if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) | 68 | if (ext & (1<<(TCPDIAG_CONG-1))) { |
| 77 | && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) | 69 | size_t len = strlen(tp->ca_ops->name); |
| 78 | vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); | 70 | strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), |
| 71 | tp->ca_ops->name); | ||
| 72 | } | ||
| 79 | } | 73 | } |
| 80 | r->tcpdiag_family = sk->sk_family; | 74 | r->tcpdiag_family = sk->sk_family; |
| 81 | r->tcpdiag_state = sk->sk_state; | 75 | r->tcpdiag_state = sk->sk_state; |
| @@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, | |||
| 166 | if (info) | 160 | if (info) |
| 167 | tcp_get_info(sk, info); | 161 | tcp_get_info(sk, info); |
| 168 | 162 | ||
| 169 | if (vinfo) { | 163 | if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) |
| 170 | if (tcp_is_vegas(tp)) { | 164 | tp->ca_ops->get_info(tp, ext, skb); |
| 171 | vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; | ||
| 172 | vinfo->tcpv_rttcnt = tp->vegas.cntRTT; | ||
| 173 | vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); | ||
| 174 | vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); | ||
| 175 | } else { | ||
| 176 | vinfo->tcpv_enabled = 0; | ||
| 177 | vinfo->tcpv_rttcnt = 0; | ||
| 178 | vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); | ||
| 179 | vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); | ||
| 180 | } | ||
| 181 | } | ||
| 182 | 165 | ||
| 183 | nlh->nlmsg_len = skb->tail - b; | 166 | nlh->nlmsg_len = skb->tail - b; |
| 184 | return skb->len; | 167 | return skb->len; |
| 185 | 168 | ||
| 169 | rtattr_failure: | ||
| 186 | nlmsg_failure: | 170 | nlmsg_failure: |
| 187 | skb_trim(skb, b - skb->data); | 171 | skb_trim(skb, b - skb->data); |
| 188 | return -1; | 172 | return -1; |
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 000000000000..36c51f8136bf --- /dev/null +++ b/net/ipv4/tcp_highspeed.c | |||
| @@ -0,0 +1,181 @@ | |||
| 1 | /* | ||
| 2 | * Sally Floyd's High Speed TCP (RFC 3649) congestion control | ||
| 3 | * | ||
| 4 | * See http://www.icir.org/floyd/hstcp.html | ||
| 5 | * | ||
| 6 | * John Heffner <jheffner@psc.edu> | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/config.h> | ||
| 10 | #include <linux/module.h> | ||
| 11 | #include <net/tcp.h> | ||
| 12 | |||
| 13 | |||
| 14 | /* From the AIMD tables in RFC 3649 appendix B, | ||
| 15 | * with fixed-point MD scaled <<8. | ||
| 16 | */ | ||
| 17 | static const struct hstcp_aimd_val { | ||
| 18 | unsigned int cwnd; | ||
| 19 | unsigned int md; | ||
| 20 | } hstcp_aimd_vals[] = { | ||
| 21 | { 38, 128, /* 0.50 */ }, | ||
| 22 | { 118, 112, /* 0.44 */ }, | ||
| 23 | { 221, 104, /* 0.41 */ }, | ||
| 24 | { 347, 98, /* 0.38 */ }, | ||
| 25 | { 495, 93, /* 0.37 */ }, | ||
| 26 | { 663, 89, /* 0.35 */ }, | ||
| 27 | { 851, 86, /* 0.34 */ }, | ||
| 28 | { 1058, 83, /* 0.33 */ }, | ||
| 29 | { 1284, 81, /* 0.32 */ }, | ||
| 30 | { 1529, 78, /* 0.31 */ }, | ||
| 31 | { 1793, 76, /* 0.30 */ }, | ||
| 32 | { 2076, 74, /* 0.29 */ }, | ||
| 33 | { 2378, 72, /* 0.28 */ }, | ||
| 34 | { 2699, 71, /* 0.28 */ }, | ||
| 35 | { 3039, 69, /* 0.27 */ }, | ||
| 36 | { 3399, 68, /* 0.27 */ }, | ||
| 37 | { 3778, 66, /* 0.26 */ }, | ||
| 38 | { 4177, 65, /* 0.26 */ }, | ||
| 39 | { 4596, 64, /* 0.25 */ }, | ||
| 40 | { 5036, 62, /* 0.25 */ }, | ||
| 41 | { 5497, 61, /* 0.24 */ }, | ||
| 42 | { 5979, 60, /* 0.24 */ }, | ||
| 43 | { 6483, 59, /* 0.23 */ }, | ||
| 44 | { 7009, 58, /* 0.23 */ }, | ||
| 45 | { 7558, 57, /* 0.22 */ }, | ||
| 46 | { 8130, 56, /* 0.22 */ }, | ||
| 47 | { 8726, 55, /* 0.22 */ }, | ||
| 48 | { 9346, 54, /* 0.21 */ }, | ||
| 49 | { 9991, 53, /* 0.21 */ }, | ||
| 50 | { 10661, 52, /* 0.21 */ }, | ||
| 51 | { 11358, 52, /* 0.20 */ }, | ||
| 52 | { 12082, 51, /* 0.20 */ }, | ||
| 53 | { 12834, 50, /* 0.20 */ }, | ||
| 54 | { 13614, 49, /* 0.19 */ }, | ||
| 55 | { 14424, 48, /* 0.19 */ }, | ||
| 56 | { 15265, 48, /* 0.19 */ }, | ||
| 57 | { 16137, 47, /* 0.19 */ }, | ||
| 58 | { 17042, 46, /* 0.18 */ }, | ||
| 59 | { 17981, 45, /* 0.18 */ }, | ||
| 60 | { 18955, 45, /* 0.18 */ }, | ||
| 61 | { 19965, 44, /* 0.17 */ }, | ||
| 62 | { 21013, 43, /* 0.17 */ }, | ||
| 63 | { 22101, 43, /* 0.17 */ }, | ||
| 64 | { 23230, 42, /* 0.17 */ }, | ||
| 65 | { 24402, 41, /* 0.16 */ }, | ||
| 66 | { 25618, 41, /* 0.16 */ }, | ||
| 67 | { 26881, 40, /* 0.16 */ }, | ||
| 68 | { 28193, 39, /* 0.16 */ }, | ||
| 69 | { 29557, 39, /* 0.15 */ }, | ||
| 70 | { 30975, 38, /* 0.15 */ }, | ||
| 71 | { 32450, 38, /* 0.15 */ }, | ||
| 72 | { 33986, 37, /* 0.15 */ }, | ||
| 73 | { 35586, 36, /* 0.14 */ }, | ||
| 74 | { 37253, 36, /* 0.14 */ }, | ||
| 75 | { 38992, 35, /* 0.14 */ }, | ||
| 76 | { 40808, 35, /* 0.14 */ }, | ||
| 77 | { 42707, 34, /* 0.13 */ }, | ||
| 78 | { 44694, 33, /* 0.13 */ }, | ||
| 79 | { 46776, 33, /* 0.13 */ }, | ||
| 80 | { 48961, 32, /* 0.13 */ }, | ||
| 81 | { 51258, 32, /* 0.13 */ }, | ||
| 82 | { 53677, 31, /* 0.12 */ }, | ||
| 83 | { 56230, 30, /* 0.12 */ }, | ||
| 84 | { 58932, 30, /* 0.12 */ }, | ||
| 85 | { 61799, 29, /* 0.12 */ }, | ||
| 86 | { 64851, 28, /* 0.11 */ }, | ||
| 87 | { 68113, 28, /* 0.11 */ }, | ||
| 88 | { 71617, 27, /* 0.11 */ }, | ||
| 89 | { 75401, 26, /* 0.10 */ }, | ||
| 90 | { 79517, 26, /* 0.10 */ }, | ||
| 91 | { 84035, 25, /* 0.10 */ }, | ||
| 92 | { 89053, 24, /* 0.10 */ }, | ||
| 93 | }; | ||
| 94 | |||
| 95 | #define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) | ||
| 96 | |||
| 97 | struct hstcp { | ||
| 98 | u32 ai; | ||
| 99 | }; | ||
| 100 | |||
| 101 | static void hstcp_init(struct tcp_sock *tp) | ||
| 102 | { | ||
| 103 | struct hstcp *ca = tcp_ca(tp); | ||
| 104 | |||
| 105 | ca->ai = 0; | ||
| 106 | |||
| 107 | /* Ensure the MD arithmetic works. This is somewhat pedantic, | ||
| 108 | * since I don't think we will see a cwnd this large. :) */ | ||
| 109 | tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); | ||
| 110 | } | ||
| 111 | |||
| 112 | static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
| 113 | u32 in_flight, int good) | ||
| 114 | { | ||
| 115 | struct hstcp *ca = tcp_ca(tp); | ||
| 116 | |||
| 117 | if (in_flight < tp->snd_cwnd) | ||
| 118 | return; | ||
| 119 | |||
| 120 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
| 121 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 122 | tp->snd_cwnd++; | ||
| 123 | } else { | ||
| 124 | /* Update AIMD parameters */ | ||
| 125 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { | ||
| 126 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && | ||
| 127 | ca->ai < HSTCP_AIMD_MAX - 1) | ||
| 128 | ca->ai++; | ||
| 129 | } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) { | ||
| 130 | while (ca->ai > 0 && | ||
| 131 | tp->snd_cwnd <= hstcp_aimd_vals[ca->ai - 1].cwnd) | ||
| 132 | ca->ai--; | ||
| 133 | } | ||
| 134 | |||
| 135 | /* Do additive increase */ | ||
| 136 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) { | ||
| 137 | tp->snd_cwnd_cnt += ca->ai; | ||
| 138 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
| 139 | tp->snd_cwnd++; | ||
| 140 | tp->snd_cwnd_cnt -= tp->snd_cwnd; | ||
| 141 | } | ||
| 142 | } | ||
| 143 | } | ||
| 144 | } | ||
| 145 | |||
| 146 | static u32 hstcp_ssthresh(struct tcp_sock *tp) | ||
| 147 | { | ||
| 148 | struct hstcp *ca = tcp_ca(tp); | ||
| 149 | |||
| 150 | /* Do multiplicative decrease */ | ||
| 151 | return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); | ||
| 152 | } | ||
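On each ACK in congestion avoidance the additive-increase credit ca->ai is added to snd_cwnd_cnt, so cwnd grows by roughly ai segments per RTT, while on loss hstcp_ssthresh() applies the table's multiplicative decrease md, which is b(w) scaled by 256. A worked sketch of the decrease arithmetic, assuming cwnd = 1500 and the table row { 1529, 78 }:

#include <stdio.h>

/* Worked example of the High Speed TCP fixed-point decrease above,
 * using the table row { cwnd = 1529, md = 78 }; md is b(w) scaled <<8,
 * so 78/256 ~ 0.30. */
int main(void)
{
	unsigned int cwnd = 1500;
	unsigned int md = 78;			/* from hstcp_aimd_vals[] */

	/* hstcp_ssthresh(): cwnd - cwnd*b(w) */
	unsigned int ssthresh = cwnd - ((cwnd * md) >> 8);

	printf("on loss: ssthresh = %u\n", ssthresh);	/* 1500 - 457 = 1043 */
	return 0;
}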
| 153 | |||
| 154 | |||
| 155 | static struct tcp_congestion_ops tcp_highspeed = { | ||
| 156 | .init = hstcp_init, | ||
| 157 | .ssthresh = hstcp_ssthresh, | ||
| 158 | .cong_avoid = hstcp_cong_avoid, | ||
| 159 | .min_cwnd = tcp_reno_min_cwnd, | ||
| 160 | |||
| 161 | .owner = THIS_MODULE, | ||
| 162 | .name = "highspeed" | ||
| 163 | }; | ||
| 164 | |||
| 165 | static int __init hstcp_register(void) | ||
| 166 | { | ||
| 167 | BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); | ||
| 168 | return tcp_register_congestion_control(&tcp_highspeed); | ||
| 169 | } | ||
| 170 | |||
| 171 | static void __exit hstcp_unregister(void) | ||
| 172 | { | ||
| 173 | tcp_unregister_congestion_control(&tcp_highspeed); | ||
| 174 | } | ||
| 175 | |||
| 176 | module_init(hstcp_register); | ||
| 177 | module_exit(hstcp_unregister); | ||
| 178 | |||
| 179 | MODULE_AUTHOR("John Heffner"); | ||
| 180 | MODULE_LICENSE("GPL"); | ||
| 181 | MODULE_DESCRIPTION("High Speed TCP"); | ||
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c new file mode 100644 index 000000000000..40168275acf9 --- /dev/null +++ b/net/ipv4/tcp_htcp.c | |||
| @@ -0,0 +1,289 @@ | |||
| 1 | /* | ||
| 2 | * H-TCP congestion control. The algorithm is detailed in: | ||
| 3 | * R.N.Shorten, D.J.Leith: | ||
| 4 | * "H-TCP: TCP for high-speed and long-distance networks" | ||
| 5 | * Proc. PFLDnet, Argonne, 2004. | ||
| 6 | * http://www.hamilton.ie/net/htcp3.pdf | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/config.h> | ||
| 10 | #include <linux/mm.h> | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include <net/tcp.h> | ||
| 13 | |||
| 14 | #define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ | ||
| 15 | #define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ | ||
| 16 | #define BETA_MAX 102 /* 0.8 with shift << 7 */ | ||
| 17 | |||
| 18 | static int use_rtt_scaling = 1; | ||
| 19 | module_param(use_rtt_scaling, int, 0644); | ||
| 20 | MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); | ||
| 21 | |||
| 22 | static int use_bandwidth_switch = 1; | ||
| 23 | module_param(use_bandwidth_switch, int, 0644); | ||
| 24 | MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); | ||
| 25 | |||
| 26 | struct htcp { | ||
| 27 | u16 alpha; /* Fixed point arith, << 7 */ | ||
| 28 | u8 beta; /* Fixed point arith, << 7 */ | ||
| 29 | u8 modeswitch; /* Delay modeswitch until we have had at least one congestion event */ | ||
| 30 | u8 ccount; /* Number of RTTs since last congestion event */ | ||
| 31 | u8 undo_ccount; | ||
| 32 | u16 packetcount; | ||
| 33 | u32 minRTT; | ||
| 34 | u32 maxRTT; | ||
| 35 | u32 snd_cwnd_cnt2; | ||
| 36 | |||
| 37 | u32 undo_maxRTT; | ||
| 38 | u32 undo_old_maxB; | ||
| 39 | |||
| 40 | /* Bandwidth estimation */ | ||
| 41 | u32 minB; | ||
| 42 | u32 maxB; | ||
| 43 | u32 old_maxB; | ||
| 44 | u32 Bi; | ||
| 45 | u32 lasttime; | ||
| 46 | }; | ||
| 47 | |||
| 48 | static inline void htcp_reset(struct htcp *ca) | ||
| 49 | { | ||
| 50 | ca->undo_ccount = ca->ccount; | ||
| 51 | ca->undo_maxRTT = ca->maxRTT; | ||
| 52 | ca->undo_old_maxB = ca->old_maxB; | ||
| 53 | |||
| 54 | ca->ccount = 0; | ||
| 55 | ca->snd_cwnd_cnt2 = 0; | ||
| 56 | } | ||
| 57 | |||
| 58 | static u32 htcp_cwnd_undo(struct tcp_sock *tp) | ||
| 59 | { | ||
| 60 | struct htcp *ca = tcp_ca(tp); | ||
| 61 | ca->ccount = ca->undo_ccount; | ||
| 62 | ca->maxRTT = ca->undo_maxRTT; | ||
| 63 | ca->old_maxB = ca->undo_old_maxB; | ||
| 64 | return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); | ||
| 65 | } | ||
| 66 | |||
| 67 | static inline void measure_rtt(struct tcp_sock *tp) | ||
| 68 | { | ||
| 69 | struct htcp *ca = tcp_ca(tp); | ||
| 70 | u32 srtt = tp->srtt>>3; | ||
| 71 | |||
| 72 | /* keep track of minimum RTT seen so far, minRTT is zero at first */ | ||
| 73 | if (ca->minRTT > srtt || !ca->minRTT) | ||
| 74 | ca->minRTT = srtt; | ||
| 75 | |||
| 76 | /* max RTT */ | ||
| 77 | if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { | ||
| 78 | if (ca->maxRTT < ca->minRTT) | ||
| 79 | ca->maxRTT = ca->minRTT; | ||
| 80 | if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) | ||
| 81 | ca->maxRTT = srtt; | ||
| 82 | } | ||
| 83 | } | ||
| 84 | |||
| 85 | static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) | ||
| 86 | { | ||
| 87 | struct htcp *ca = tcp_ca(tp); | ||
| 88 | u32 now = tcp_time_stamp; | ||
| 89 | |||
| 90 | /* achieved throughput calculations */ | ||
| 91 | if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { | ||
| 92 | ca->packetcount = 0; | ||
| 93 | ca->lasttime = now; | ||
| 94 | return; | ||
| 95 | } | ||
| 96 | |||
| 97 | ca->packetcount += pkts_acked; | ||
| 98 | |||
| 99 | if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1) | ||
| 100 | && now - ca->lasttime >= ca->minRTT | ||
| 101 | && ca->minRTT > 0) { | ||
| 102 | __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); | ||
| 103 | if (ca->ccount <= 3) { | ||
| 104 | /* just after backoff */ | ||
| 105 | ca->minB = ca->maxB = ca->Bi = cur_Bi; | ||
| 106 | } else { | ||
| 107 | ca->Bi = (3*ca->Bi + cur_Bi)/4; | ||
| 108 | if (ca->Bi > ca->maxB) | ||
| 109 | ca->maxB = ca->Bi; | ||
| 110 | if (ca->minB > ca->maxB) | ||
| 111 | ca->minB = ca->maxB; | ||
| 112 | } | ||
| 113 | ca->packetcount = 0; | ||
| 114 | ca->lasttime = now; | ||
| 115 | } | ||
| 116 | } | ||
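The filter above turns ACK arrivals into a packets-per-second estimate: once roughly a window's worth of packets has been acknowledged over at least one minRTT, cur_Bi = packetcount*HZ/interval, and Bi is then smoothed as (3*Bi + cur_Bi)/4. A standalone sketch with made-up numbers, assuming HZ = 1000:

#include <stdio.h>

/* Worked example of the throughput filter above: cur_Bi is packets/sec
 * over at least one minRTT, and Bi is smoothed as (3*Bi + cur_Bi)/4.
 * HZ = 1000 and the sample values are made up for illustration. */
#define HZ 1000

int main(void)
{
	unsigned int Bi = 800;			/* previous estimate, pkts/sec */
	unsigned int packetcount = 90;
	unsigned int interval = 100;		/* jiffies since lasttime */

	unsigned int cur_Bi = packetcount * HZ / interval;	/* 900 pkts/sec */

	Bi = (3 * Bi + cur_Bi) / 4;				/* (2400+900)/4 = 825 */
	printf("cur_Bi=%u  Bi=%u\n", cur_Bi, Bi);
	return 0;
}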
| 117 | |||
| 118 | static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) | ||
| 119 | { | ||
| 120 | if (use_bandwidth_switch) { | ||
| 121 | u32 maxB = ca->maxB; | ||
| 122 | u32 old_maxB = ca->old_maxB; | ||
| 123 | ca->old_maxB = ca->maxB; | ||
| 124 | |||
| 125 | if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { | ||
| 126 | ca->beta = BETA_MIN; | ||
| 127 | ca->modeswitch = 0; | ||
| 128 | return; | ||
| 129 | } | ||
| 130 | } | ||
| 131 | |||
| 132 | if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { | ||
| 133 | ca->beta = (minRTT<<7)/maxRTT; | ||
| 134 | if (ca->beta < BETA_MIN) | ||
| 135 | ca->beta = BETA_MIN; | ||
| 136 | else if (ca->beta > BETA_MAX) | ||
| 137 | ca->beta = BETA_MAX; | ||
| 138 | } else { | ||
| 139 | ca->beta = BETA_MIN; | ||
| 140 | ca->modeswitch = 1; | ||
| 141 | } | ||
| 142 | } | ||
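With the bandwidth switch out of the way, beta is simply minRTT/maxRTT in <<7 fixed point, clamped to [BETA_MIN, BETA_MAX], i.e. 0.5 to 0.8. A worked sketch, assuming a 60 ms minRTT and a 100 ms maxRTT at HZ = 1000:

#include <stdio.h>

/* Worked example of the beta computation in htcp_beta_update() above,
 * with the bandwidth switch ignored.  Values assumed for illustration:
 * minRTT = 60 jiffies, maxRTT = 100 jiffies (HZ = 1000). */
#define BETA_MIN (1 << 6)	/* 0.5 in <<7 fixed point */
#define BETA_MAX 102		/* 0.8 in <<7 fixed point */

int main(void)
{
	unsigned int minRTT = 60, maxRTT = 100;
	unsigned int beta = (minRTT << 7) / maxRTT;	/* 76 ~ 0.59 */

	if (beta < BETA_MIN)
		beta = BETA_MIN;
	else if (beta > BETA_MAX)
		beta = BETA_MAX;
	printf("beta = %u/128\n", beta);
	return 0;
}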
| 143 | |||
| 144 | static inline void htcp_alpha_update(struct htcp *ca) | ||
| 145 | { | ||
| 146 | u32 minRTT = ca->minRTT; | ||
| 147 | u32 factor = 1; | ||
| 148 | u32 diff = ca->ccount * minRTT; /* time since last backoff */ | ||
| 149 | |||
| 150 | if (diff > HZ) { | ||
| 151 | diff -= HZ; | ||
| 152 | factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; | ||
| 153 | } | ||
| 154 | |||
| 155 | if (use_rtt_scaling && minRTT) { | ||
| 156 | u32 scale = (HZ<<3)/(10*minRTT); | ||
| 157 | scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ | ||
| 158 | factor = (factor<<3)/scale; | ||
| 159 | if (!factor) | ||
| 160 | factor = 1; | ||
| 161 | } | ||
| 162 | |||
| 163 | ca->alpha = 2*factor*((1<<7)-ca->beta); | ||
| 164 | if (!ca->alpha) | ||
| 165 | ca->alpha = ALPHA_BASE; | ||
| 166 | } | ||
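Ignoring the optional RTT scaling, the code above computes factor = 1 + 10(D-1) + ((D-1)/2)^2 once D, the time since the last backoff, exceeds one second, and then alpha = 2*factor*(1-beta) in <<7 fixed point. A worked sketch, assuming HZ = 1000, three seconds since the last backoff and beta = 0.5:

#include <stdio.h>

/* Worked example of htcp_alpha_update() above, with RTT scaling left
 * out.  HZ = 1000, 3 seconds since the last backoff and beta = 0.5
 * (BETA_MIN = 64 in <<7 fixed point) are assumed. */
#define HZ 1000

int main(void)
{
	unsigned int diff = 3 * HZ;		/* ccount * minRTT, in jiffies */
	unsigned int beta = 64;			/* 0.5 << 7 */
	unsigned int factor = 1, alpha;

	if (diff > HZ) {
		diff -= HZ;
		/* factor = 1 + 10*(D-1) + ((D-1)/2)^2, D in seconds */
		factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
	}
	alpha = 2 * factor * ((1 << 7) - beta);
	printf("factor=%u alpha=%u (~%u segments per RTT)\n",
	       factor, alpha, alpha >> 7);	/* factor=22, alpha=2816, ~22 */
	return 0;
}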
| 167 | |||
| 168 | /* After we have the rtt data to calculate beta, we'd still prefer to wait one | ||
| 169 | * rtt before we adjust our beta to ensure we are working from consistent | ||
| 170 | * data. | ||
| 171 | * | ||
| 172 | * This function should be called when we hit a congestion event since only at | ||
| 173 | * that point do we really have a real sense of maxRTT (the queues en route | ||
| 174 | * were getting just too full now). | ||
| 175 | */ | ||
| 176 | static void htcp_param_update(struct tcp_sock *tp) | ||
| 177 | { | ||
| 178 | struct htcp *ca = tcp_ca(tp); | ||
| 179 | u32 minRTT = ca->minRTT; | ||
| 180 | u32 maxRTT = ca->maxRTT; | ||
| 181 | |||
| 182 | htcp_beta_update(ca, minRTT, maxRTT); | ||
| 183 | htcp_alpha_update(ca); | ||
| 184 | |||
| 185 | /* add slowly fading memory for maxRTT to accommodate routing changes etc */ | ||
| 186 | if (minRTT > 0 && maxRTT > minRTT) | ||
| 187 | ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; | ||
| 188 | } | ||
| 189 | |||
| 190 | static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) | ||
| 191 | { | ||
| 192 | struct htcp *ca = tcp_ca(tp); | ||
| 193 | htcp_param_update(tp); | ||
| 194 | return max((tp->snd_cwnd * ca->beta) >> 7, 2U); | ||
| 195 | } | ||
| 196 | |||
| 197 | static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
| 198 | u32 in_flight, int data_acked) | ||
| 199 | { | ||
| 200 | struct htcp *ca = tcp_ca(tp); | ||
| 201 | |||
| 202 | if (in_flight < tp->snd_cwnd) | ||
| 203 | return; | ||
| 204 | |||
| 205 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
| 206 | /* In "safe" area, increase. */ | ||
| 207 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 208 | tp->snd_cwnd++; | ||
| 209 | } else { | ||
| 210 | measure_rtt(tp); | ||
| 211 | |||
| 212 | /* keep track of number of round-trip times since last backoff event */ | ||
| 213 | if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { | ||
| 214 | ca->ccount++; | ||
| 215 | ca->snd_cwnd_cnt2 = 0; | ||
| 216 | htcp_alpha_update(ca); | ||
| 217 | } | ||
| 218 | |||
| 219 | /* In dangerous area, increase slowly. | ||
| 220 | * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd | ||
| 221 | */ | ||
| 222 | if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { | ||
| 223 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 224 | tp->snd_cwnd++; | ||
| 225 | tp->snd_cwnd_cnt = 0; | ||
| 226 | ca->ccount++; | ||
| 227 | } | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
| 231 | /* Lower bound on congestion window. */ | ||
| 232 | static u32 htcp_min_cwnd(struct tcp_sock *tp) | ||
| 233 | { | ||
| 234 | return tp->snd_ssthresh; | ||
| 235 | } | ||
| 236 | |||
| 237 | |||
| 238 | static void htcp_init(struct tcp_sock *tp) | ||
| 239 | { | ||
| 240 | struct htcp *ca = tcp_ca(tp); | ||
| 241 | |||
| 242 | memset(ca, 0, sizeof(struct htcp)); | ||
| 243 | ca->alpha = ALPHA_BASE; | ||
| 244 | ca->beta = BETA_MIN; | ||
| 245 | } | ||
| 246 | |||
| 247 | static void htcp_state(struct tcp_sock *tp, u8 new_state) | ||
| 248 | { | ||
| 249 | switch (new_state) { | ||
| 250 | case TCP_CA_CWR: | ||
| 251 | case TCP_CA_Recovery: | ||
| 252 | case TCP_CA_Loss: | ||
| 253 | htcp_reset(tcp_ca(tp)); | ||
| 254 | break; | ||
| 255 | } | ||
| 256 | } | ||
| 257 | |||
| 258 | static struct tcp_congestion_ops htcp = { | ||
| 259 | .init = htcp_init, | ||
| 260 | .ssthresh = htcp_recalc_ssthresh, | ||
| 261 | .min_cwnd = htcp_min_cwnd, | ||
| 262 | .cong_avoid = htcp_cong_avoid, | ||
| 263 | .set_state = htcp_state, | ||
| 264 | .undo_cwnd = htcp_cwnd_undo, | ||
| 265 | .pkts_acked = measure_achieved_throughput, | ||
| 266 | .owner = THIS_MODULE, | ||
| 267 | .name = "htcp", | ||
| 268 | }; | ||
| 269 | |||
| 270 | static int __init htcp_register(void) | ||
| 271 | { | ||
| 272 | BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); | ||
| 273 | BUILD_BUG_ON(BETA_MIN >= BETA_MAX); | ||
| 274 | if (!use_bandwidth_switch) | ||
| 275 | htcp.pkts_acked = NULL; | ||
| 276 | return tcp_register_congestion_control(&htcp); | ||
| 277 | } | ||
| 278 | |||
| 279 | static void __exit htcp_unregister(void) | ||
| 280 | { | ||
| 281 | tcp_unregister_congestion_control(&htcp); | ||
| 282 | } | ||
| 283 | |||
| 284 | module_init(htcp_register); | ||
| 285 | module_exit(htcp_unregister); | ||
| 286 | |||
| 287 | MODULE_AUTHOR("Baruch Even"); | ||
| 288 | MODULE_LICENSE("GPL"); | ||
| 289 | MODULE_DESCRIPTION("H-TCP"); | ||
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 000000000000..13a66342c304 --- /dev/null +++ b/net/ipv4/tcp_hybla.c | |||
| @@ -0,0 +1,187 @@ | |||
| 1 | /* | ||
| 2 | * TCP HYBLA | ||
| 3 | * | ||
| 4 | * TCP-HYBLA Congestion control algorithm, based on: | ||
| 5 | * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement | ||
| 6 | * for Heterogeneous Networks", | ||
| 7 | * International Journal of Satellite Communications, | ||
| 8 | * September 2004 | ||
| 9 | * Daniele Lacamera | ||
| 10 | * root at danielinux.net | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/config.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <net/tcp.h> | ||
| 16 | |||
| 17 | /* Tcp Hybla structure. */ | ||
| 18 | struct hybla { | ||
| 19 | u8 hybla_en; | ||
| 20 | u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ | ||
| 21 | u32 rho; /* Rho parameter, integer part */ | ||
| 22 | u32 rho2; /* Rho * Rho, integer part */ | ||
| 23 | u32 rho_3ls; /* Rho parameter, <<3 */ | ||
| 24 | u32 rho2_7ls; /* Rho^2, <<7 */ | ||
| 25 | u32 minrtt; /* Minimum smoothed round trip time value seen */ | ||
| 26 | }; | ||
| 27 | |||
| 28 | /* Hybla reference round trip time (default= 1/40 sec = 25 ms), | ||
| 29 | expressed in milliseconds */ | ||
| 30 | static int rtt0 = 25; | ||
| 31 | module_param(rtt0, int, 0644); | ||
| 32 | MODULE_PARM_DESC(rtt0, "reference round trip time (ms)"); | ||
| 33 | |||
| 34 | |||
| 35 | /* This is called to refresh values for hybla parameters */ | ||
| 36 | static inline void hybla_recalc_param (struct tcp_sock *tp) | ||
| 37 | { | ||
| 38 | struct hybla *ca = tcp_ca(tp); | ||
| 39 | |||
| 40 | ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); | ||
| 41 | ca->rho = ca->rho_3ls >> 3; | ||
| 42 | ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; | ||
| 43 | ca->rho2 = ca->rho2_7ls >>7; | ||
| 44 | } | ||
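Because tp->srtt is already kept scaled <<3 by the RTT estimator, dividing it by rtt0 (converted to jiffies) directly yields rho in <<3 fixed point, and the max_t() floor of 8 keeps rho >= 1. A worked sketch, assuming HZ = 1000 and a 300 ms smoothed RTT:

#include <stdio.h>

/* Worked example of hybla_recalc_param() above.  tp->srtt is the
 * smoothed RTT scaled <<3, so dividing by rtt0 (in jiffies) directly
 * yields rho <<3.  HZ = 1000 and a 300 ms RTT are assumed. */
int main(void)
{
	unsigned int srtt = 300 << 3;	/* 300 ms RTT, kernel keeps srtt <<3 */
	unsigned int rtt0 = 25;		/* reference RTT in jiffies (25 ms at HZ=1000) */

	unsigned int rho_3ls  = srtt / rtt0;			/* 96  = 12.0 <<3 */
	unsigned int rho      = rho_3ls >> 3;			/* 12 */
	unsigned int rho2_7ls = (rho_3ls * rho_3ls) << 1;	/* 18432 = 144 <<7 */
	unsigned int rho2     = rho2_7ls >> 7;			/* 144 = rho^2 */

	printf("rho=%u rho^2=%u\n", rho, rho2);
	return 0;
}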
| 45 | |||
| 46 | static void hybla_init(struct tcp_sock *tp) | ||
| 47 | { | ||
| 48 | struct hybla *ca = tcp_ca(tp); | ||
| 49 | |||
| 50 | ca->rho = 0; | ||
| 51 | ca->rho2 = 0; | ||
| 52 | ca->rho_3ls = 0; | ||
| 53 | ca->rho2_7ls = 0; | ||
| 54 | ca->snd_cwnd_cents = 0; | ||
| 55 | ca->hybla_en = 1; | ||
| 56 | tp->snd_cwnd = 2; | ||
| 57 | tp->snd_cwnd_clamp = 65535; | ||
| 58 | |||
| 59 | /* 1st Rho measurement based on initial srtt */ | ||
| 60 | hybla_recalc_param(tp); | ||
| 61 | |||
| 62 | /* set minimum rtt as this is the 1st ever seen */ | ||
| 63 | ca->minrtt = tp->srtt; | ||
| 64 | tp->snd_cwnd = ca->rho; | ||
| 65 | } | ||
| 66 | |||
| 67 | static void hybla_state(struct tcp_sock *tp, u8 ca_state) | ||
| 68 | { | ||
| 69 | struct hybla *ca = tcp_ca(tp); | ||
| 70 | |||
| 71 | ca->hybla_en = (ca_state == TCP_CA_Open); | ||
| 72 | } | ||
| 73 | |||
| 74 | static inline u32 hybla_fraction(u32 odds) | ||
| 75 | { | ||
| 76 | static const u32 fractions[] = { | ||
| 77 | 128, 139, 152, 165, 181, 197, 215, 234, | ||
| 78 | }; | ||
| 79 | |||
| 80 | return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; | ||
| 81 | } | ||
| 82 | |||
| 83 | /* TCP Hybla main routine. | ||
| 84 | * This is the algorithm behavior: | ||
| 85 | * o Recalc Hybla parameters if min_rtt has changed | ||
| 86 | * o Give cwnd a new value based on the model proposed | ||
| 87 | * o remember increments <1 | ||
| 88 | */ | ||
| 89 | static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
| 90 | u32 in_flight, int flag) | ||
| 91 | { | ||
| 92 | struct hybla *ca = tcp_ca(tp); | ||
| 93 | u32 increment, odd, rho_fractions; | ||
| 94 | int is_slowstart = 0; | ||
| 95 | |||
| 96 | /* Recalculate rho only if this srtt is the lowest */ | ||
| 97 | if (tp->srtt < ca->minrtt){ | ||
| 98 | hybla_recalc_param(tp); | ||
| 99 | ca->minrtt = tp->srtt; | ||
| 100 | } | ||
| 101 | |||
| 102 | if (!ca->hybla_en) | ||
| 103 | return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); | ||
| 104 | |||
| 105 | if (in_flight < tp->snd_cwnd) | ||
| 106 | return; | ||
| 107 | |||
| 108 | if (ca->rho == 0) | ||
| 109 | hybla_recalc_param(tp); | ||
| 110 | |||
| 111 | rho_fractions = ca->rho_3ls - (ca->rho << 3); | ||
| 112 | |||
| 113 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
| 114 | /* | ||
| 115 | * slow start | ||
| 116 | * INC = 2^RHO - 1 | ||
| 117 | * This is done by splitting the rho parameter | ||
| 118 | * into 2 parts: an integer part and a fraction part. | ||
| 119 | * Increment<<7 is estimated by doing: | ||
| 120 | * [2^(int+fract)]<<7 | ||
| 121 | * that is equal to: | ||
| 122 | * (2^int) * [(2^fract) <<7] | ||
| 123 | * 2^int is computed directly as 1<<int, | ||
| 124 | * while we use hybla_fraction() to | ||
| 125 | * calculate 2^fract as a <<7 value. | ||
| 126 | */ | ||
| 127 | is_slowstart = 1; | ||
| 128 | increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) | ||
| 129 | - 128; | ||
| 130 | } else { | ||
| 131 | /* | ||
| 132 | * congestion avoidance | ||
| 133 | * INC = RHO^2 / W | ||
| 134 | * as long as increment is estimated as (rho<<7)/window | ||
| 135 | * it already is <<7 and we can easily count its fractions. | ||
| 136 | */ | ||
| 137 | increment = ca->rho2_7ls / tp->snd_cwnd; | ||
| 138 | if (increment < 128) | ||
| 139 | tp->snd_cwnd_cnt++; | ||
| 140 | } | ||
| 141 | |||
| 142 | odd = increment % 128; | ||
| 143 | tp->snd_cwnd += increment >> 7; | ||
| 144 | ca->snd_cwnd_cents += odd; | ||
| 145 | |||
| 146 | /* when the accumulated fraction reaches >= 128, increase cwnd by 1. */ | ||
| 147 | while(ca->snd_cwnd_cents >= 128) { | ||
| 148 | tp->snd_cwnd++; | ||
| 149 | ca->snd_cwnd_cents -= 128; | ||
| 150 | tp->snd_cwnd_cnt = 0; | ||
| 151 | } | ||
| 152 | |||
| 153 | /* clamp down slowstart cwnd to ssthresh value. */ | ||
| 154 | if (is_slowstart) | ||
| 155 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
| 156 | |||
| 157 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 158 | } | ||
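In both branches the increment is kept <<7 and the sub-unit remainder is banked in snd_cwnd_cents until it reaches a whole segment. A worked sketch of the two increments for rho = 2.5 (rho_3ls = 20), where hybla_fraction(4) = 181 approximates 2^0.5 in <<7 fixed point:

#include <stdio.h>

/* Worked example of the two Hybla increments above for rho = 2.5
 * (rho_3ls = 20, so rho = 2 and rho_fractions = 4).  Values are kept
 * in <<7 fixed point, as in the code. */
int main(void)
{
	unsigned int rho = 2;
	unsigned int fraction = 181;			/* hybla_fraction(4) ~ 2^0.5 <<7 */
	unsigned int rho2_7ls = (20 * 20) << 1;		/* rho^2 <<7 = 800 */
	unsigned int cwnd = 100;

	/* slow start: INC = 2^rho - 1, i.e. (1<<rho) * (2^fract <<7) - (1 <<7) */
	unsigned int ss_inc = (1 << rho) * fraction - 128;	/* 596 ~ 4.66 <<7 */

	/* congestion avoidance: INC = rho^2 / W, already <<7 */
	unsigned int ca_inc = rho2_7ls / cwnd;			/* 8 = 0.0625 <<7 */

	printf("slow start inc ~ %u/128, cong avoid inc ~ %u/128\n",
	       ss_inc, ca_inc);
	return 0;
}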
| 159 | |||
| 160 | static struct tcp_congestion_ops tcp_hybla = { | ||
| 161 | .init = hybla_init, | ||
| 162 | .ssthresh = tcp_reno_ssthresh, | ||
| 163 | .min_cwnd = tcp_reno_min_cwnd, | ||
| 164 | .cong_avoid = hybla_cong_avoid, | ||
| 165 | .set_state = hybla_state, | ||
| 166 | |||
| 167 | .owner = THIS_MODULE, | ||
| 168 | .name = "hybla" | ||
| 169 | }; | ||
| 170 | |||
| 171 | static int __init hybla_register(void) | ||
| 172 | { | ||
| 173 | BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); | ||
| 174 | return tcp_register_congestion_control(&tcp_hybla); | ||
| 175 | } | ||
| 176 | |||
| 177 | static void __exit hybla_unregister(void) | ||
| 178 | { | ||
| 179 | tcp_unregister_congestion_control(&tcp_hybla); | ||
| 180 | } | ||
| 181 | |||
| 182 | module_init(hybla_register); | ||
| 183 | module_exit(hybla_unregister); | ||
| 184 | |||
| 185 | MODULE_AUTHOR("Daniele Lacamera"); | ||
| 186 | MODULE_LICENSE("GPL"); | ||
| 187 | MODULE_DESCRIPTION("TCP Hybla"); | ||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a3..7bbbbc33eb4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
| @@ -61,7 +61,6 @@ | |||
| 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission | 61 | * Panu Kuhlberg: Experimental audit of TCP (re)transmission |
| 62 | * engine. Lots of bugs are found. | 62 | * engine. Lots of bugs are found. |
| 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs | 63 | * Pasi Sarolahti: F-RTO for dealing with spurious RTOs |
| 64 | * Angelo Dell'Aera: TCP Westwood+ support | ||
| 65 | */ | 64 | */ |
| 66 | 65 | ||
| 67 | #include <linux/config.h> | 66 | #include <linux/config.h> |
| @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; | |||
| 88 | int sysctl_tcp_max_orphans = NR_FILE; | 87 | int sysctl_tcp_max_orphans = NR_FILE; |
| 89 | int sysctl_tcp_frto; | 88 | int sysctl_tcp_frto; |
| 90 | int sysctl_tcp_nometrics_save; | 89 | int sysctl_tcp_nometrics_save; |
| 91 | int sysctl_tcp_westwood; | ||
| 92 | int sysctl_tcp_vegas_cong_avoid; | ||
| 93 | 90 | ||
| 94 | int sysctl_tcp_moderate_rcvbuf = 1; | 91 | int sysctl_tcp_moderate_rcvbuf = 1; |
| 95 | 92 | ||
| 96 | /* Default values of the Vegas variables, in fixed-point representation | ||
| 97 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
| 98 | */ | ||
| 99 | #define V_PARAM_SHIFT 1 | ||
| 100 | int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; | ||
| 101 | int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; | ||
| 102 | int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; | ||
| 103 | int sysctl_tcp_bic = 1; | ||
| 104 | int sysctl_tcp_bic_fast_convergence = 1; | ||
| 105 | int sysctl_tcp_bic_low_window = 14; | ||
| 106 | int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
| 107 | |||
| 108 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ | 93 | #define FLAG_DATA 0x01 /* Incoming frame contained data. */ |
| 109 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ | 94 | #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ |
| 110 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ | 95 | #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ |
| @@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) | |||
| 333 | tp->snd_cwnd_stamp = tcp_time_stamp; | 318 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 334 | } | 319 | } |
| 335 | 320 | ||
| 336 | static void init_bictcp(struct tcp_sock *tp) | ||
| 337 | { | ||
| 338 | tp->bictcp.cnt = 0; | ||
| 339 | |||
| 340 | tp->bictcp.last_max_cwnd = 0; | ||
| 341 | tp->bictcp.last_cwnd = 0; | ||
| 342 | tp->bictcp.last_stamp = 0; | ||
| 343 | } | ||
| 344 | |||
| 345 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ | 321 | /* 5. Recalculate window clamp after socket hit its memory bounds. */ |
| 346 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) | 322 | static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) |
| 347 | { | 323 | { |
| @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ | |||
| 558 | tcp_grow_window(sk, tp, skb); | 534 | tcp_grow_window(sk, tp, skb); |
| 559 | } | 535 | } |
| 560 | 536 | ||
| 561 | /* When starting a new connection, pin down the current choice of | ||
| 562 | * congestion algorithm. | ||
| 563 | */ | ||
| 564 | void tcp_ca_init(struct tcp_sock *tp) | ||
| 565 | { | ||
| 566 | if (sysctl_tcp_westwood) | ||
| 567 | tp->adv_cong = TCP_WESTWOOD; | ||
| 568 | else if (sysctl_tcp_bic) | ||
| 569 | tp->adv_cong = TCP_BIC; | ||
| 570 | else if (sysctl_tcp_vegas_cong_avoid) { | ||
| 571 | tp->adv_cong = TCP_VEGAS; | ||
| 572 | tp->vegas.baseRTT = 0x7fffffff; | ||
| 573 | tcp_vegas_enable(tp); | ||
| 574 | } | ||
| 575 | } | ||
| 576 | |||
| 577 | /* Do RTT sampling needed for Vegas. | ||
| 578 | * Basically we: | ||
| 579 | * o min-filter RTT samples from within an RTT to get the current | ||
| 580 | * propagation delay + queuing delay (we are min-filtering to try to | ||
| 581 | * avoid the effects of delayed ACKs) | ||
| 582 | * o min-filter RTT samples from a much longer window (forever for now) | ||
| 583 | * to find the propagation delay (baseRTT) | ||
| 584 | */ | ||
| 585 | static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | ||
| 586 | { | ||
| 587 | __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ | ||
| 588 | |||
| 589 | /* Filter to find propagation delay: */ | ||
| 590 | if (vrtt < tp->vegas.baseRTT) | ||
| 591 | tp->vegas.baseRTT = vrtt; | ||
| 592 | |||
| 593 | /* Find the min RTT during the last RTT to find | ||
| 594 | * the current prop. delay + queuing delay: | ||
| 595 | */ | ||
| 596 | tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); | ||
| 597 | tp->vegas.cntRTT++; | ||
| 598 | } | ||
| 599 | |||
| 600 | /* Called to compute a smoothed rtt estimate. The data fed to this | 537 | /* Called to compute a smoothed rtt estimate. The data fed to this |
| 601 | * routine either comes from timestamps, or from segments that were | 538 | * routine either comes from timestamps, or from segments that were |
| 602 | * known _not_ to have been retransmitted [see Karn/Partridge | 539 | * known _not_ to have been retransmitted [see Karn/Partridge |
| @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) | |||
| 606 | * To save cycles in the RFC 1323 implementation it was better to break | 543 | * To save cycles in the RFC 1323 implementation it was better to break |
| 607 | * it up into three procedures. -- erics | 544 | * it up into three procedures. -- erics |
| 608 | */ | 545 | */ |
| 609 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | 546 | static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) |
| 610 | { | 547 | { |
| 611 | long m = mrtt; /* RTT */ | 548 | long m = mrtt; /* RTT */ |
| 612 | 549 | ||
| 613 | if (tcp_vegas_enabled(tp)) | ||
| 614 | vegas_rtt_calc(tp, mrtt); | ||
| 615 | |||
| 616 | /* The following amusing code comes from Jacobson's | 550 | /* The following amusing code comes from Jacobson's |
| 617 | * article in SIGCOMM '88. Note that rtt and mdev | 551 | * article in SIGCOMM '88. Note that rtt and mdev |
| 618 | * are scaled versions of rtt and mean deviation. | 552 | * are scaled versions of rtt and mean deviation. |
| @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) | |||
| 670 | tp->rtt_seq = tp->snd_nxt; | 604 | tp->rtt_seq = tp->snd_nxt; |
| 671 | } | 605 | } |
| 672 | 606 | ||
| 673 | tcp_westwood_update_rtt(tp, tp->srtt >> 3); | 607 | if (tp->ca_ops->rtt_sample) |
| 608 | tp->ca_ops->rtt_sample(tp, *usrtt); | ||
| 674 | } | 609 | } |
| 675 | 610 | ||
| 676 | /* Calculate rto without backoff. This is the second half of Van Jacobson's | 611 | /* Calculate rto without backoff. This is the second half of Van Jacobson's |
| @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) | |||
| 1185 | tp->snd_una == tp->high_seq || | 1120 | tp->snd_una == tp->high_seq || |
| 1186 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1121 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
| 1187 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1122 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1188 | if (!tcp_westwood_ssthresh(tp)) | 1123 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1189 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1124 | tcp_ca_event(tp, CA_EVENT_FRTO); |
| 1190 | } | 1125 | } |
| 1191 | 1126 | ||
| 1192 | /* Have to clear retransmission markers here to keep the bookkeeping | 1127 | /* Have to clear retransmission markers here to keep the bookkeeping |
| @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) | |||
| 1252 | tcp_set_ca_state(tp, TCP_CA_Loss); | 1187 | tcp_set_ca_state(tp, TCP_CA_Loss); |
| 1253 | tp->high_seq = tp->frto_highmark; | 1188 | tp->high_seq = tp->frto_highmark; |
| 1254 | TCP_ECN_queue_cwr(tp); | 1189 | TCP_ECN_queue_cwr(tp); |
| 1255 | |||
| 1256 | init_bictcp(tp); | ||
| 1257 | } | 1190 | } |
| 1258 | 1191 | ||
| 1259 | void tcp_clear_retrans(struct tcp_sock *tp) | 1192 | void tcp_clear_retrans(struct tcp_sock *tp) |
| @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) | |||
| 1283 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || | 1216 | if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || |
| 1284 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { | 1217 | (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { |
| 1285 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1218 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1286 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1219 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1220 | tcp_ca_event(tp, CA_EVENT_LOSS); | ||
| 1287 | } | 1221 | } |
| 1288 | tp->snd_cwnd = 1; | 1222 | tp->snd_cwnd = 1; |
| 1289 | tp->snd_cwnd_cnt = 0; | 1223 | tp->snd_cwnd_cnt = 0; |
| @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
| 1596 | } | 1530 | } |
| 1597 | 1531 | ||
| 1598 | /* Decrease cwnd each second ack. */ | 1532 | /* Decrease cwnd each second ack. */ |
| 1599 | |||
| 1600 | static void tcp_cwnd_down(struct tcp_sock *tp) | 1533 | static void tcp_cwnd_down(struct tcp_sock *tp) |
| 1601 | { | 1534 | { |
| 1602 | int decr = tp->snd_cwnd_cnt + 1; | 1535 | int decr = tp->snd_cwnd_cnt + 1; |
| 1603 | __u32 limit; | ||
| 1604 | |||
| 1605 | /* | ||
| 1606 | * TCP Westwood | ||
| 1607 | * Here limit is evaluated as BWestimation*RTTmin (for obtaining it | ||
| 1608 | * in packets we use mss_cache). If sysctl_tcp_westwood is off | ||
| 1609 | * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is | ||
| 1610 | * still used as usual. It prevents other strange cases in which | ||
| 1611 | * BWE*RTTmin could assume value 0. It should not happen but... | ||
| 1612 | */ | ||
| 1613 | |||
| 1614 | if (!(limit = tcp_westwood_bw_rttmin(tp))) | ||
| 1615 | limit = tp->snd_ssthresh/2; | ||
| 1616 | 1536 | ||
| 1617 | tp->snd_cwnd_cnt = decr&1; | 1537 | tp->snd_cwnd_cnt = decr&1; |
| 1618 | decr >>= 1; | 1538 | decr >>= 1; |
| 1619 | 1539 | ||
| 1620 | if (decr && tp->snd_cwnd > limit) | 1540 | if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) |
| 1621 | tp->snd_cwnd -= decr; | 1541 | tp->snd_cwnd -= decr; |
| 1622 | 1542 | ||
| 1623 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1543 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
| @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) | |||
| 1654 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) | 1574 | static void tcp_undo_cwr(struct tcp_sock *tp, int undo) |
| 1655 | { | 1575 | { |
| 1656 | if (tp->prior_ssthresh) { | 1576 | if (tp->prior_ssthresh) { |
| 1657 | if (tcp_is_bic(tp)) | 1577 | if (tp->ca_ops->undo_cwnd) |
| 1658 | tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); | 1578 | tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); |
| 1659 | else | 1579 | else |
| 1660 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); | 1580 | tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); |
| 1661 | 1581 | ||
| @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) | |||
| 1767 | 1687 | ||
| 1768 | static inline void tcp_complete_cwr(struct tcp_sock *tp) | 1688 | static inline void tcp_complete_cwr(struct tcp_sock *tp) |
| 1769 | { | 1689 | { |
| 1770 | if (tcp_westwood_cwnd(tp)) | 1690 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); |
| 1771 | tp->snd_ssthresh = tp->snd_cwnd; | ||
| 1772 | else | ||
| 1773 | tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | ||
| 1774 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1691 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 1692 | tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); | ||
| 1775 | } | 1693 | } |
| 1776 | 1694 | ||
| 1777 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) | 1695 | static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) |
| @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1946 | if (tp->ca_state < TCP_CA_CWR) { | 1864 | if (tp->ca_state < TCP_CA_CWR) { |
| 1947 | if (!(flag&FLAG_ECE)) | 1865 | if (!(flag&FLAG_ECE)) |
| 1948 | tp->prior_ssthresh = tcp_current_ssthresh(tp); | 1866 | tp->prior_ssthresh = tcp_current_ssthresh(tp); |
| 1949 | tp->snd_ssthresh = tcp_recalc_ssthresh(tp); | 1867 | tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); |
| 1950 | TCP_ECN_queue_cwr(tp); | 1868 | TCP_ECN_queue_cwr(tp); |
| 1951 | } | 1869 | } |
| 1952 | 1870 | ||
| @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, | |||
| 1963 | /* Read draft-ietf-tcplw-high-performance before mucking | 1881 | /* Read draft-ietf-tcplw-high-performance before mucking |
| 1964 | * with this code. (Superceeds RFC1323) | 1882 | * with this code. (Superceeds RFC1323) |
| 1965 | */ | 1883 | */ |
| 1966 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | 1884 | static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) |
| 1967 | { | 1885 | { |
| 1968 | __u32 seq_rtt; | 1886 | __u32 seq_rtt; |
| 1969 | 1887 | ||
| @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) | |||
| 1983 | * in window is lost... Voila. --ANK (010210) | 1901 | * in window is lost... Voila. --ANK (010210) |
| 1984 | */ | 1902 | */ |
| 1985 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 1903 | seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; |
| 1986 | tcp_rtt_estimator(tp, seq_rtt); | 1904 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
| 1987 | tcp_set_rto(tp); | 1905 | tcp_set_rto(tp); |
| 1988 | tp->backoff = 0; | 1906 | tp->backoff = 0; |
| 1989 | tcp_bound_rto(tp); | 1907 | tcp_bound_rto(tp); |
| 1990 | } | 1908 | } |
| 1991 | 1909 | ||
| 1992 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | 1910 | static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) |
| 1993 | { | 1911 | { |
| 1994 | /* We don't have a timestamp. Can only use | 1912 | /* We don't have a timestamp. Can only use |
| 1995 | * packets that are not retransmitted to determine | 1913 | * packets that are not retransmitted to determine |
| @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) | |||
| 2003 | if (flag & FLAG_RETRANS_DATA_ACKED) | 1921 | if (flag & FLAG_RETRANS_DATA_ACKED) |
| 2004 | return; | 1922 | return; |
| 2005 | 1923 | ||
| 2006 | tcp_rtt_estimator(tp, seq_rtt); | 1924 | tcp_rtt_estimator(tp, seq_rtt, usrtt); |
| 2007 | tcp_set_rto(tp); | 1925 | tcp_set_rto(tp); |
| 2008 | tp->backoff = 0; | 1926 | tp->backoff = 0; |
| 2009 | tcp_bound_rto(tp); | 1927 | tcp_bound_rto(tp); |
| 2010 | } | 1928 | } |
| 2011 | 1929 | ||
| 2012 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, | 1930 | static inline void tcp_ack_update_rtt(struct tcp_sock *tp, |
| 2013 | int flag, s32 seq_rtt) | 1931 | int flag, s32 seq_rtt, u32 *usrtt) |
| 2014 | { | 1932 | { |
| 2015 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ | 1933 | /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ |
| 2016 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) | 1934 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) |
| 2017 | tcp_ack_saw_tstamp(tp, flag); | 1935 | tcp_ack_saw_tstamp(tp, usrtt, flag); |
| 2018 | else if (seq_rtt >= 0) | 1936 | else if (seq_rtt >= 0) |
| 2019 | tcp_ack_no_tstamp(tp, seq_rtt, flag); | 1937 | tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); |
| 2020 | } | 1938 | } |
| 2021 | 1939 | ||
| 2022 | /* | 1940 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, |
| 2023 | * Compute congestion window to use. | 1941 | u32 in_flight, int good) |
| 2024 | * | ||
| 2025 | * This is from the implementation of BICTCP in | ||
| 2026 | * Lison-Xu, Kahaled Harfoush, and Injog Rhee. | ||
| 2027 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
| 2028 | * Networks" in InfoComm 2004 | ||
| 2029 | * Available from: | ||
| 2030 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
| 2031 | * | ||
| 2032 | * Unless BIC is enabled and congestion window is large | ||
| 2033 | * this behaves the same as the original Reno. | ||
| 2034 | */ | ||
| 2035 | static inline __u32 bictcp_cwnd(struct tcp_sock *tp) | ||
| 2036 | { | ||
| 2037 | /* orignal Reno behaviour */ | ||
| 2038 | if (!tcp_is_bic(tp)) | ||
| 2039 | return tp->snd_cwnd; | ||
| 2040 | |||
| 2041 | if (tp->bictcp.last_cwnd == tp->snd_cwnd && | ||
| 2042 | (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) | ||
| 2043 | return tp->bictcp.cnt; | ||
| 2044 | |||
| 2045 | tp->bictcp.last_cwnd = tp->snd_cwnd; | ||
| 2046 | tp->bictcp.last_stamp = tcp_time_stamp; | ||
| 2047 | |||
| 2048 | /* start off normal */ | ||
| 2049 | if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) | ||
| 2050 | tp->bictcp.cnt = tp->snd_cwnd; | ||
| 2051 | |||
| 2052 | /* binary increase */ | ||
| 2053 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { | ||
| 2054 | __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) | ||
| 2055 | / BICTCP_B; | ||
| 2056 | |||
| 2057 | if (dist > BICTCP_MAX_INCREMENT) | ||
| 2058 | /* linear increase */ | ||
| 2059 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
| 2060 | else if (dist <= 1U) | ||
| 2061 | /* binary search increase */ | ||
| 2062 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
| 2063 | / BICTCP_B; | ||
| 2064 | else | ||
| 2065 | /* binary search increase */ | ||
| 2066 | tp->bictcp.cnt = tp->snd_cwnd / dist; | ||
| 2067 | } else { | ||
| 2068 | /* slow start amd linear increase */ | ||
| 2069 | if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) | ||
| 2070 | /* slow start */ | ||
| 2071 | tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR | ||
| 2072 | / BICTCP_B; | ||
| 2073 | else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd | ||
| 2074 | + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) | ||
| 2075 | /* slow start */ | ||
| 2076 | tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) | ||
| 2077 | / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); | ||
| 2078 | else | ||
| 2079 | /* linear increase */ | ||
| 2080 | tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; | ||
| 2081 | } | ||
| 2082 | return tp->bictcp.cnt; | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | /* This is Jacobson's slow start and congestion avoidance. | ||
| 2086 | * SIGCOMM '88, p. 328. | ||
| 2087 | */ | ||
| 2088 | static inline void reno_cong_avoid(struct tcp_sock *tp) | ||
| 2089 | { | 1942 | { |
| 2090 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | 1943 | tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); |
| 2091 | /* In "safe" area, increase. */ | ||
| 2092 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 2093 | tp->snd_cwnd++; | ||
| 2094 | } else { | ||
| 2095 | /* In dangerous area, increase slowly. | ||
| 2096 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
| 2097 | */ | ||
| 2098 | if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { | ||
| 2099 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
| 2100 | tp->snd_cwnd++; | ||
| 2101 | tp->snd_cwnd_cnt=0; | ||
| 2102 | } else | ||
| 2103 | tp->snd_cwnd_cnt++; | ||
| 2104 | } | ||
| 2105 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1944 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 2106 | } | 1945 | } |
| 2107 | 1946 | ||
| 2108 | /* This is based on the congestion detection/avoidance scheme described in | ||
| 2109 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
| 2110 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
| 2111 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
| 2112 | * October 1995. Available from: | ||
| 2113 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
| 2114 | * | ||
| 2115 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
| 2116 | * The main aspects that distinguish this implementation from the | ||
| 2117 | * Arizona Vegas implementation are: | ||
| 2118 | * o We do not change the loss detection or recovery mechanisms of | ||
| 2119 | * Linux in any way. Linux already recovers from losses quite well, | ||
| 2120 | * using fine-grained timers, NewReno, and FACK. | ||
| 2121 | * o To avoid the performance penalty imposed by increasing cwnd | ||
| 2122 | * only every-other RTT during slow start, we increase during | ||
| 2123 | * every RTT during slow start, just like Reno. | ||
| 2124 | * o Largely to allow continuous cwnd growth during slow start, | ||
| 2125 | * we use the rate at which ACKs come back as the "actual" | ||
| 2126 | * rate, rather than the rate at which data is sent. | ||
| 2127 | * o To speed convergence to the right rate, we set the cwnd | ||
| 2128 | * to achieve the right ("actual") rate when we exit slow start. | ||
| 2129 | * o To filter out the noise caused by delayed ACKs, we use the | ||
| 2130 | * minimum RTT sample observed during the last RTT to calculate | ||
| 2131 | * the actual rate. | ||
| 2132 | * o When the sender re-starts from idle, it waits until it has | ||
| 2133 | * received ACKs for an entire flight of new data before making | ||
| 2134 | * a cwnd adjustment decision. The original Vegas implementation | ||
| 2135 | * assumed senders never went idle. | ||
| 2136 | */ | ||
| 2137 | static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
| 2138 | { | ||
| 2139 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
| 2140 | * | ||
| 2141 | * These are so named because they represent the approximate values | ||
| 2142 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
| 2143 | * precisely, they represent the amount of data sent during the RTT. | ||
| 2144 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
| 2145 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
| 2146 | * bytes of data have been ACKed during the course of the RTT, giving | ||
| 2147 | * an "actual" rate of: | ||
| 2148 | * | ||
| 2149 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
| 2150 | * | ||
| 2151 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
| 2152 | * because delayed ACKs can cover more than one segment, so they | ||
| 2153 | * don't line up nicely with the boundaries of RTTs. | ||
| 2154 | * | ||
| 2155 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
| 2156 | * advance of the left edge of our send window, so that the number | ||
| 2157 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
| 2158 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
| 2159 | */ | ||
| 2160 | |||
| 2161 | if (after(ack, tp->vegas.beg_snd_nxt)) { | ||
| 2162 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
| 2163 | u32 old_wnd, old_snd_cwnd; | ||
| 2164 | |||
| 2165 | |||
| 2166 | /* Here old_wnd is essentially the window of data that was | ||
| 2167 | * sent during the previous RTT, and has all | ||
| 2168 | * been acknowledged in the course of the RTT that ended | ||
| 2169 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
| 2170 | * is the cwnd during the previous RTT. | ||
| 2171 | */ | ||
| 2172 | old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / | ||
| 2173 | tp->mss_cache_std; | ||
| 2174 | old_snd_cwnd = tp->vegas.beg_snd_cwnd; | ||
| 2175 | |||
| 2176 | /* Save the extent of the current window so we can use this | ||
| 2177 | * at the end of the next RTT. | ||
| 2178 | */ | ||
| 2179 | tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; | ||
| 2180 | tp->vegas.beg_snd_nxt = tp->snd_nxt; | ||
| 2181 | tp->vegas.beg_snd_cwnd = tp->snd_cwnd; | ||
| 2182 | |||
| 2183 | /* Take into account the current RTT sample too, to | ||
| 2184 | * decrease the impact of delayed acks. This double counts | ||
| 2185 | * this sample since we count it for the next window as well, | ||
| 2186 | * but that's not too awful, since we're taking the min, | ||
| 2187 | * rather than averaging. | ||
| 2188 | */ | ||
| 2189 | vegas_rtt_calc(tp, seq_rtt); | ||
| 2190 | |||
| 2191 | /* We do the Vegas calculations only if we got enough RTT | ||
| 2192 | * samples that we can be reasonably sure that we got | ||
| 2193 | * at least one RTT sample that wasn't from a delayed ACK. | ||
| 2194 | * If we only had 2 samples total, | ||
| 2195 | * then that means we're getting only 1 ACK per RTT, which | ||
| 2196 | * means they're almost certainly delayed ACKs. | ||
| 2197 | * If we have 3 samples, we should be OK. | ||
| 2198 | */ | ||
| 2199 | |||
| 2200 | if (tp->vegas.cntRTT <= 2) { | ||
| 2201 | /* We don't have enough RTT samples to do the Vegas | ||
| 2202 | * calculation, so we'll behave like Reno. | ||
| 2203 | */ | ||
| 2204 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
| 2205 | tp->snd_cwnd++; | ||
| 2206 | } else { | ||
| 2207 | u32 rtt, target_cwnd, diff; | ||
| 2208 | |||
| 2209 | /* We have enough RTT samples, so, using the Vegas | ||
| 2210 | * algorithm, we determine if we should increase or | ||
| 2211 | * decrease cwnd, and by how much. | ||
| 2212 | */ | ||
| 2213 | |||
| 2214 | /* Pluck out the RTT we are using for the Vegas | ||
| 2215 | * calculations. This is the min RTT seen during the | ||
| 2216 | * last RTT. Taking the min filters out the effects | ||
| 2217 | * of delayed ACKs, at the cost of noticing congestion | ||
| 2218 | * a bit later. | ||
| 2219 | */ | ||
| 2220 | rtt = tp->vegas.minRTT; | ||
| 2221 | |||
| 2222 | /* Calculate the cwnd we should have, if we weren't | ||
| 2223 | * going too fast. | ||
| 2224 | * | ||
| 2225 | * This is: | ||
| 2226 | * (actual rate in segments) * baseRTT | ||
| 2227 | * We keep it as a fixed point number with | ||
| 2228 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
| 2229 | */ | ||
| 2230 | target_cwnd = ((old_wnd * tp->vegas.baseRTT) | ||
| 2231 | << V_PARAM_SHIFT) / rtt; | ||
| 2232 | |||
| 2233 | /* Calculate the difference between the window we had, | ||
| 2234 | * and the window we would like to have. This quantity | ||
| 2235 | * is the "Diff" from the Arizona Vegas papers. | ||
| 2236 | * | ||
| 2237 | * Again, this is a fixed point number with | ||
| 2238 | * V_PARAM_SHIFT bits to the right of the binary | ||
| 2239 | * point. | ||
| 2240 | */ | ||
| 2241 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
| 2242 | |||
| 2243 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
| 2244 | /* Slow start. */ | ||
| 2245 | if (diff > sysctl_tcp_vegas_gamma) { | ||
| 2246 | /* Going too fast. Time to slow down | ||
| 2247 | * and switch to congestion avoidance. | ||
| 2248 | */ | ||
| 2249 | tp->snd_ssthresh = 2; | ||
| 2250 | |||
| 2251 | /* Set cwnd to match the actual rate | ||
| 2252 | * exactly: | ||
| 2253 | * cwnd = (actual rate) * baseRTT | ||
| 2254 | * Then we add 1 because the integer | ||
| 2255 | * truncation robs us of full link | ||
| 2256 | * utilization. | ||
| 2257 | */ | ||
| 2258 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
| 2259 | (target_cwnd >> | ||
| 2260 | V_PARAM_SHIFT)+1); | ||
| 2261 | |||
| 2262 | } | ||
| 2263 | } else { | ||
| 2264 | /* Congestion avoidance. */ | ||
| 2265 | u32 next_snd_cwnd; | ||
| 2266 | |||
| 2267 | /* Figure out where we would like cwnd | ||
| 2268 | * to be. | ||
| 2269 | */ | ||
| 2270 | if (diff > sysctl_tcp_vegas_beta) { | ||
| 2271 | /* The old window was too fast, so | ||
| 2272 | * we slow down. | ||
| 2273 | */ | ||
| 2274 | next_snd_cwnd = old_snd_cwnd - 1; | ||
| 2275 | } else if (diff < sysctl_tcp_vegas_alpha) { | ||
| 2276 | /* We don't have enough extra packets | ||
| 2277 | * in the network, so speed up. | ||
| 2278 | */ | ||
| 2279 | next_snd_cwnd = old_snd_cwnd + 1; | ||
| 2280 | } else { | ||
| 2281 | /* Sending just as fast as we | ||
| 2282 | * should be. | ||
| 2283 | */ | ||
| 2284 | next_snd_cwnd = old_snd_cwnd; | ||
| 2285 | } | ||
| 2286 | |||
| 2287 | /* Adjust cwnd upward or downward, toward the | ||
| 2288 | * desired value. | ||
| 2289 | */ | ||
| 2290 | if (next_snd_cwnd > tp->snd_cwnd) | ||
| 2291 | tp->snd_cwnd++; | ||
| 2292 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
| 2293 | tp->snd_cwnd--; | ||
| 2294 | } | ||
| 2295 | } | ||
| 2296 | |||
| 2297 | /* Wipe the slate clean for the next RTT. */ | ||
| 2298 | tp->vegas.cntRTT = 0; | ||
| 2299 | tp->vegas.minRTT = 0x7fffffff; | ||
| 2300 | } | ||
| 2301 | |||
| 2302 | /* The following code is executed for every ack we receive, | ||
| 2303 | * except for conditions checked in should_advance_cwnd() | ||
| 2304 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
| 2305 | * we only execute this code if the ack actually acked some | ||
| 2306 | * data. | ||
| 2307 | */ | ||
| 2308 | |||
| 2309 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
| 2310 | * (If we are not in slow start then we are in congestion avoidance, | ||
| 2311 | * and adjust our congestion window only once per RTT. See the code | ||
| 2312 | * above.) | ||
| 2313 | */ | ||
| 2314 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
| 2315 | tp->snd_cwnd++; | ||
| 2316 | |||
| 2317 | /* to keep cwnd from growing without bound */ | ||
| 2318 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 2319 | |||
| 2320 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
| 2321 | * 2 MSS. | ||
| 2322 | * | ||
| 2323 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
| 2324 | */ | ||
| 2325 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
| 2326 | |||
| 2327 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 2328 | } | ||
| 2329 | |||
| 2330 | static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) | ||
| 2331 | { | ||
| 2332 | if (tcp_vegas_enabled(tp)) | ||
| 2333 | vegas_cong_avoid(tp, ack, seq_rtt); | ||
| 2334 | else | ||
| 2335 | reno_cong_avoid(tp); | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | /* Restart timer after forward progress on connection. | 1947 | /* Restart timer after forward progress on connection. |
| 2339 | * RFC2988 recommends to restart timer to now+rto. | 1948 | * RFC2988 recommends to restart timer to now+rto. |
| 2340 | */ | 1949 | */ |
| @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, | |||
| 2415 | 2024 | ||
| 2416 | 2025 | ||
| 2417 | /* Remove acknowledged frames from the retransmission queue. */ | 2026 | /* Remove acknowledged frames from the retransmission queue. */ |
| 2418 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | 2027 | static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) |
| 2419 | { | 2028 | { |
| 2420 | struct tcp_sock *tp = tcp_sk(sk); | 2029 | struct tcp_sock *tp = tcp_sk(sk); |
| 2421 | struct sk_buff *skb; | 2030 | struct sk_buff *skb; |
| 2422 | __u32 now = tcp_time_stamp; | 2031 | __u32 now = tcp_time_stamp; |
| 2423 | int acked = 0; | 2032 | int acked = 0; |
| 2424 | __s32 seq_rtt = -1; | 2033 | __s32 seq_rtt = -1; |
| 2034 | struct timeval usnow; | ||
| 2035 | u32 pkts_acked = 0; | ||
| 2036 | |||
| 2037 | if (seq_usrtt) | ||
| 2038 | do_gettimeofday(&usnow); | ||
| 2425 | 2039 | ||
| 2426 | while ((skb = skb_peek(&sk->sk_write_queue)) && | 2040 | while ((skb = skb_peek(&sk->sk_write_queue)) && |
| 2427 | skb != sk->sk_send_head) { | 2041 | skb != sk->sk_send_head) { |
| @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2448 | */ | 2062 | */ |
| 2449 | if (!(scb->flags & TCPCB_FLAG_SYN)) { | 2063 | if (!(scb->flags & TCPCB_FLAG_SYN)) { |
| 2450 | acked |= FLAG_DATA_ACKED; | 2064 | acked |= FLAG_DATA_ACKED; |
| 2065 | ++pkts_acked; | ||
| 2451 | } else { | 2066 | } else { |
| 2452 | acked |= FLAG_SYN_ACKED; | 2067 | acked |= FLAG_SYN_ACKED; |
| 2453 | tp->retrans_stamp = 0; | 2068 | tp->retrans_stamp = 0; |
| @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2461 | seq_rtt = -1; | 2076 | seq_rtt = -1; |
| 2462 | } else if (seq_rtt < 0) | 2077 | } else if (seq_rtt < 0) |
| 2463 | seq_rtt = now - scb->when; | 2078 | seq_rtt = now - scb->when; |
| 2079 | if (seq_usrtt) | ||
| 2080 | *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 | ||
| 2081 | + (usnow.tv_usec - skb->stamp.tv_usec); | ||
| 2082 | |||
| 2464 | if (sacked & TCPCB_SACKED_ACKED) | 2083 | if (sacked & TCPCB_SACKED_ACKED) |
| 2465 | tp->sacked_out -= tcp_skb_pcount(skb); | 2084 | tp->sacked_out -= tcp_skb_pcount(skb); |
| 2466 | if (sacked & TCPCB_LOST) | 2085 | if (sacked & TCPCB_LOST) |
| @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) | |||
| 2479 | } | 2098 | } |
| 2480 | 2099 | ||
| 2481 | if (acked&FLAG_ACKED) { | 2100 | if (acked&FLAG_ACKED) { |
| 2482 | tcp_ack_update_rtt(tp, acked, seq_rtt); | 2101 | tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); |
| 2483 | tcp_ack_packets_out(sk, tp); | 2102 | tcp_ack_packets_out(sk, tp); |
| 2103 | |||
| 2104 | if (tp->ca_ops->pkts_acked) | ||
| 2105 | tp->ca_ops->pkts_acked(tp, pkts_acked); | ||
| 2484 | } | 2106 | } |
| 2485 | 2107 | ||
| 2486 | #if FASTRETRANS_DEBUG > 0 | 2108 | #if FASTRETRANS_DEBUG > 0 |
| @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) | |||
| 2624 | tp->frto_counter = (tp->frto_counter + 1) % 3; | 2246 | tp->frto_counter = (tp->frto_counter + 1) % 3; |
| 2625 | } | 2247 | } |
| 2626 | 2248 | ||
| 2627 | /* | ||
| 2628 | * TCP Westwood+ | ||
| 2629 | */ | ||
| 2630 | |||
| 2631 | /* | ||
| 2632 | * @init_westwood | ||
| 2633 | * This function initializes fields used in TCP Westwood+. We can't | ||
| 2634 | * get no information about RTTmin at this time so we simply set it to | ||
| 2635 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
| 2636 | * since in this way we're sure it will be updated in a consistent | ||
| 2637 | * way as soon as possible. It will reasonably happen within the first | ||
| 2638 | * RTT period of the connection lifetime. | ||
| 2639 | */ | ||
| 2640 | |||
| 2641 | static void init_westwood(struct sock *sk) | ||
| 2642 | { | ||
| 2643 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2644 | |||
| 2645 | tp->westwood.bw_ns_est = 0; | ||
| 2646 | tp->westwood.bw_est = 0; | ||
| 2647 | tp->westwood.accounted = 0; | ||
| 2648 | tp->westwood.cumul_ack = 0; | ||
| 2649 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
| 2650 | tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; | ||
| 2651 | tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; | ||
| 2652 | tp->westwood.snd_una = tp->snd_una; | ||
| 2653 | } | ||
| 2654 | |||
| 2655 | /* | ||
| 2656 | * @westwood_do_filter | ||
| 2657 | * Low-pass filter. Implemented using constant coefficients. | ||
| 2658 | */ | ||
| 2659 | |||
| 2660 | static inline __u32 westwood_do_filter(__u32 a, __u32 b) | ||
| 2661 | { | ||
| 2662 | return (((7 * a) + b) >> 3); | ||
| 2663 | } | ||
| 2664 | |||
| 2665 | static void westwood_filter(struct sock *sk, __u32 delta) | ||
| 2666 | { | ||
| 2667 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2668 | |||
| 2669 | tp->westwood.bw_ns_est = | ||
| 2670 | westwood_do_filter(tp->westwood.bw_ns_est, | ||
| 2671 | tp->westwood.bk / delta); | ||
| 2672 | tp->westwood.bw_est = | ||
| 2673 | westwood_do_filter(tp->westwood.bw_est, | ||
| 2674 | tp->westwood.bw_ns_est); | ||
| 2675 | } | ||
| 2676 | |||
| 2677 | /* | ||
| 2678 | * @westwood_update_rttmin | ||
| 2679 | * It is used to update RTTmin. In this case we MUST NOT use | ||
| 2680 | * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! | ||
| 2681 | */ | ||
| 2682 | |||
| 2683 | static inline __u32 westwood_update_rttmin(const struct sock *sk) | ||
| 2684 | { | ||
| 2685 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2686 | __u32 rttmin = tp->westwood.rtt_min; | ||
| 2687 | |||
| 2688 | if (tp->westwood.rtt != 0 && | ||
| 2689 | (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) | ||
| 2690 | rttmin = tp->westwood.rtt; | ||
| 2691 | |||
| 2692 | return rttmin; | ||
| 2693 | } | ||
| 2694 | |||
| 2695 | /* | ||
| 2696 | * @westwood_acked | ||
| 2697 | * Evaluate increases for bk. | ||
| 2698 | */ | ||
| 2699 | |||
| 2700 | static inline __u32 westwood_acked(const struct sock *sk) | ||
| 2701 | { | ||
| 2702 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2703 | |||
| 2704 | return tp->snd_una - tp->westwood.snd_una; | ||
| 2705 | } | ||
| 2706 | |||
| 2707 | /* | ||
| 2708 | * @westwood_new_window | ||
| 2709 | * It evaluates if we are receiving data inside the same RTT window as | ||
| 2710 | * when we started. | ||
| 2711 | * Return value: | ||
| 2712 | * It returns 0 if we are still evaluating samples in the same RTT | ||
| 2713 | * window, 1 if the sample has to be considered in the next window. | ||
| 2714 | */ | ||
| 2715 | |||
| 2716 | static int westwood_new_window(const struct sock *sk) | ||
| 2717 | { | ||
| 2718 | const struct tcp_sock *tp = tcp_sk(sk); | ||
| 2719 | __u32 left_bound; | ||
| 2720 | __u32 rtt; | ||
| 2721 | int ret = 0; | ||
| 2722 | |||
| 2723 | left_bound = tp->westwood.rtt_win_sx; | ||
| 2724 | rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); | ||
| 2725 | |||
| 2726 | /* | ||
| 2727 | * A RTT-window has passed. Be careful since if RTT is less than | ||
| 2728 | * 50ms we don't filter but we continue 'building the sample'. | ||
| 2729 | * This minimum limit was chosen since an estimation on small | ||
| 2730 | * time intervals is better to avoid... | ||
| 2731 | * Obviously on a LAN we reasonably will always have | ||
| 2732 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
| 2733 | */ | ||
| 2734 | |||
| 2735 | if ((left_bound + rtt) < tcp_time_stamp) | ||
| 2736 | ret = 1; | ||
| 2737 | |||
| 2738 | return ret; | ||
| 2739 | } | ||
| 2740 | |||
| 2741 | /* | ||
| 2742 | * @westwood_update_window | ||
| 2743 | * It updates RTT evaluation window if it is the right moment to do | ||
| 2744 | * it. If so it calls filter for evaluating bandwidth. | ||
| 2745 | */ | ||
| 2746 | |||
| 2747 | static void __westwood_update_window(struct sock *sk, __u32 now) | ||
| 2748 | { | ||
| 2749 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2750 | __u32 delta = now - tp->westwood.rtt_win_sx; | ||
| 2751 | |||
| 2752 | if (delta) { | ||
| 2753 | if (tp->westwood.rtt) | ||
| 2754 | westwood_filter(sk, delta); | ||
| 2755 | |||
| 2756 | tp->westwood.bk = 0; | ||
| 2757 | tp->westwood.rtt_win_sx = tcp_time_stamp; | ||
| 2758 | } | ||
| 2759 | } | ||
| 2760 | |||
| 2761 | |||
| 2762 | static void westwood_update_window(struct sock *sk, __u32 now) | ||
| 2763 | { | ||
| 2764 | if (westwood_new_window(sk)) | ||
| 2765 | __westwood_update_window(sk, now); | ||
| 2766 | } | ||
| 2767 | |||
| 2768 | /* | ||
| 2769 | * @__tcp_westwood_fast_bw | ||
| 2770 | * It is called when we are in fast path. In particular it is called when | ||
| 2771 | * header prediction is successful. In such a case the update is | ||
| 2772 | * straightforward and doesn't need any particular care. | ||
| 2773 | */ | ||
| 2774 | |||
| 2775 | static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2776 | { | ||
| 2777 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2778 | |||
| 2779 | westwood_update_window(sk, tcp_time_stamp); | ||
| 2780 | |||
| 2781 | tp->westwood.bk += westwood_acked(sk); | ||
| 2782 | tp->westwood.snd_una = tp->snd_una; | ||
| 2783 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
| 2784 | } | ||
| 2785 | |||
| 2786 | static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2787 | { | ||
| 2788 | if (tcp_is_westwood(tcp_sk(sk))) | ||
| 2789 | __tcp_westwood_fast_bw(sk, skb); | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | |||
| 2793 | /* | ||
| 2794 | * @westwood_dupack_update | ||
| 2795 | * It updates accounted and cumul_ack when receiving a dupack. | ||
| 2796 | */ | ||
| 2797 | |||
| 2798 | static void westwood_dupack_update(struct sock *sk) | ||
| 2799 | { | ||
| 2800 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2801 | |||
| 2802 | tp->westwood.accounted += tp->mss_cache_std; | ||
| 2803 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
| 2804 | } | ||
| 2805 | |||
| 2806 | static inline int westwood_may_change_cumul(struct tcp_sock *tp) | ||
| 2807 | { | ||
| 2808 | return (tp->westwood.cumul_ack > tp->mss_cache_std); | ||
| 2809 | } | ||
| 2810 | |||
| 2811 | static inline void westwood_partial_update(struct tcp_sock *tp) | ||
| 2812 | { | ||
| 2813 | tp->westwood.accounted -= tp->westwood.cumul_ack; | ||
| 2814 | tp->westwood.cumul_ack = tp->mss_cache_std; | ||
| 2815 | } | ||
| 2816 | |||
| 2817 | static inline void westwood_complete_update(struct tcp_sock *tp) | ||
| 2818 | { | ||
| 2819 | tp->westwood.cumul_ack -= tp->westwood.accounted; | ||
| 2820 | tp->westwood.accounted = 0; | ||
| 2821 | } | ||
| 2822 | |||
| 2823 | /* | ||
| 2824 | * @westwood_acked_count | ||
| 2825 | * This function evaluates cumul_ack for evaluating bk in case of | ||
| 2826 | * delayed or partial acks. | ||
| 2827 | */ | ||
| 2828 | |||
| 2829 | static inline __u32 westwood_acked_count(struct sock *sk) | ||
| 2830 | { | ||
| 2831 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2832 | |||
| 2833 | tp->westwood.cumul_ack = westwood_acked(sk); | ||
| 2834 | |||
| 2835 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
| 2836 | * tp->snd_una. | ||
| 2837 | */ | ||
| 2838 | if (!(tp->westwood.cumul_ack)) | ||
| 2839 | westwood_dupack_update(sk); | ||
| 2840 | |||
| 2841 | if (westwood_may_change_cumul(tp)) { | ||
| 2842 | /* Partial or delayed ack */ | ||
| 2843 | if (tp->westwood.accounted >= tp->westwood.cumul_ack) | ||
| 2844 | westwood_partial_update(tp); | ||
| 2845 | else | ||
| 2846 | westwood_complete_update(tp); | ||
| 2847 | } | ||
| 2848 | |||
| 2849 | tp->westwood.snd_una = tp->snd_una; | ||
| 2850 | |||
| 2851 | return tp->westwood.cumul_ack; | ||
| 2852 | } | ||
| 2853 | |||
| 2854 | |||
| 2855 | /* | ||
| 2856 | * @__tcp_westwood_slow_bw | ||
| 2857 | * It is called when something is going wrong..even if there could | ||
| 2858 | * be no problems! Infact a simple delayed packet may trigger a | ||
| 2859 | * dupack. But we need to be careful in such case. | ||
| 2860 | */ | ||
| 2861 | |||
| 2862 | static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2863 | { | ||
| 2864 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2865 | |||
| 2866 | westwood_update_window(sk, tcp_time_stamp); | ||
| 2867 | |||
| 2868 | tp->westwood.bk += westwood_acked_count(sk); | ||
| 2869 | tp->westwood.rtt_min = westwood_update_rttmin(sk); | ||
| 2870 | } | ||
| 2871 | |||
| 2872 | static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) | ||
| 2873 | { | ||
| 2874 | if (tcp_is_westwood(tcp_sk(sk))) | ||
| 2875 | __tcp_westwood_slow_bw(sk, skb); | ||
| 2876 | } | ||
| 2877 | |||
| 2878 | /* This routine deals with incoming acks, but not outgoing ones. */ | 2249 | /* This routine deals with incoming acks, but not outgoing ones. */ |
| 2879 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | 2250 | static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) |
| 2880 | { | 2251 | { |
| @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2884 | u32 ack = TCP_SKB_CB(skb)->ack_seq; | 2255 | u32 ack = TCP_SKB_CB(skb)->ack_seq; |
| 2885 | u32 prior_in_flight; | 2256 | u32 prior_in_flight; |
| 2886 | s32 seq_rtt; | 2257 | s32 seq_rtt; |
| 2258 | s32 seq_usrtt = 0; | ||
| 2887 | int prior_packets; | 2259 | int prior_packets; |
| 2888 | 2260 | ||
| 2889 | /* If the ack is newer than sent or older than previous acks | 2261 | /* If the ack is newer than sent or older than previous acks |
| @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2902 | */ | 2274 | */ |
| 2903 | tcp_update_wl(tp, ack, ack_seq); | 2275 | tcp_update_wl(tp, ack, ack_seq); |
| 2904 | tp->snd_una = ack; | 2276 | tp->snd_una = ack; |
| 2905 | tcp_westwood_fast_bw(sk, skb); | ||
| 2906 | flag |= FLAG_WIN_UPDATE; | 2277 | flag |= FLAG_WIN_UPDATE; |
| 2907 | 2278 | ||
| 2279 | tcp_ca_event(tp, CA_EVENT_FAST_ACK); | ||
| 2280 | |||
| 2908 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); | 2281 | NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); |
| 2909 | } else { | 2282 | } else { |
| 2910 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) | 2283 | if (ack_seq != TCP_SKB_CB(skb)->end_seq) |
| @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2920 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) | 2293 | if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) |
| 2921 | flag |= FLAG_ECE; | 2294 | flag |= FLAG_ECE; |
| 2922 | 2295 | ||
| 2923 | tcp_westwood_slow_bw(sk,skb); | 2296 | tcp_ca_event(tp, CA_EVENT_SLOW_ACK); |
| 2924 | } | 2297 | } |
| 2925 | 2298 | ||
| 2926 | /* We passed data and got it acked, remove any soft error | 2299 | /* We passed data and got it acked, remove any soft error |
| @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) | |||
| 2935 | prior_in_flight = tcp_packets_in_flight(tp); | 2308 | prior_in_flight = tcp_packets_in_flight(tp); |
| 2936 | 2309 | ||
| 2937 | /* See if we can take anything off of the retransmit queue. */ | 2310 | /* See if we can take anything off of the retransmit queue. */ |
| 2938 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt); | 2311 | flag |= tcp_clean_rtx_queue(sk, &seq_rtt, |
| 2312 | tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); | ||
| 2939 | 2313 | ||
| 2940 | if (tp->frto_counter) | 2314 | if (tp->frto_counter) |
| 2941 | tcp_process_frto(sk, prior_snd_una); | 2315 | tcp_process_frto(sk, prior_snd_una); |
| 2942 | 2316 | ||
| 2943 | if (tcp_ack_is_dubious(tp, flag)) { | 2317 | if (tcp_ack_is_dubious(tp, flag)) { |
| 2944 | /* Advance CWND, if state allows this. */ | 2318 | /* Advance CWND, if state allows this. */ |
| 2945 | if ((flag & FLAG_DATA_ACKED) && | 2319 | if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) |
| 2946 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && | 2320 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); |
| 2947 | tcp_may_raise_cwnd(tp, flag)) | ||
| 2948 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
| 2949 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); | 2321 | tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); |
| 2950 | } else { | 2322 | } else { |
| 2951 | if ((flag & FLAG_DATA_ACKED) && | 2323 | if ((flag & FLAG_DATA_ACKED)) |
| 2952 | (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) | 2324 | tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); |
| 2953 | tcp_cong_avoid(tp, ack, seq_rtt); | ||
| 2954 | } | 2325 | } |
| 2955 | 2326 | ||
| 2956 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) | 2327 | if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) |
| @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4552 | 3923 | ||
| 4553 | tcp_init_metrics(sk); | 3924 | tcp_init_metrics(sk); |
| 4554 | 3925 | ||
| 3926 | tcp_init_congestion_control(tp); | ||
| 3927 | |||
| 4555 | /* Prevent spurious tcp_cwnd_restart() on first data | 3928 | /* Prevent spurious tcp_cwnd_restart() on first data |
| 4556 | * packet. | 3929 | * packet. |
| 4557 | */ | 3930 | */ |
| @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4708 | if(tp->af_specific->conn_request(sk, skb) < 0) | 4081 | if(tp->af_specific->conn_request(sk, skb) < 0) |
| 4709 | return 1; | 4082 | return 1; |
| 4710 | 4083 | ||
| 4711 | init_westwood(sk); | ||
| 4712 | init_bictcp(tp); | ||
| 4713 | |||
| 4714 | /* Now we have several options: In theory there is | 4084 | /* Now we have several options: In theory there is |
| 4715 | * nothing else in the frame. KA9Q has an option to | 4085 | * nothing else in the frame. KA9Q has an option to |
| 4716 | * send data with the syn, BSD accepts data with the | 4086 | * send data with the syn, BSD accepts data with the |
| @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4732 | goto discard; | 4102 | goto discard; |
| 4733 | 4103 | ||
| 4734 | case TCP_SYN_SENT: | 4104 | case TCP_SYN_SENT: |
| 4735 | init_westwood(sk); | ||
| 4736 | init_bictcp(tp); | ||
| 4737 | |||
| 4738 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); | 4105 | queued = tcp_rcv_synsent_state_process(sk, skb, th, len); |
| 4739 | if (queued >= 0) | 4106 | if (queued >= 0) |
| 4740 | return queued; | 4107 | return queued; |
| @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4816 | */ | 4183 | */ |
| 4817 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 4184 | if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
| 4818 | !tp->srtt) | 4185 | !tp->srtt) |
| 4819 | tcp_ack_saw_tstamp(tp, 0); | 4186 | tcp_ack_saw_tstamp(tp, 0, 0); |
| 4820 | 4187 | ||
| 4821 | if (tp->rx_opt.tstamp_ok) | 4188 | if (tp->rx_opt.tstamp_ok) |
| 4822 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; | 4189 | tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; |
| @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |||
| 4828 | 4195 | ||
| 4829 | tcp_init_metrics(sk); | 4196 | tcp_init_metrics(sk); |
| 4830 | 4197 | ||
| 4198 | tcp_init_congestion_control(tp); | ||
| 4199 | |||
| 4831 | /* Prevent spurious tcp_cwnd_restart() on | 4200 | /* Prevent spurious tcp_cwnd_restart() on |
| 4832 | * first data packet. | 4201 | * first data packet. |
| 4833 | */ | 4202 | */ |
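
The tcp_input.c hunks above swap the hard-wired Vegas and Westwood calls for three generic hooks: tcp_ca_event() for fast/slow ACK notifications, tp->ca_ops->pkts_acked() driven from tcp_clean_rtx_queue(), and tp->ca_ops->rtt_sample() fed with a microsecond RTT when the module asks for one. As a rough sketch of what plugs into those hooks, a hypothetical do-nothing module that only counts ACKed packets could look like the following; the ops fields and helpers mirror the modules added later in this patch, while the ack counter and the "ackcnt" name are invented for illustration.

#include <linux/config.h>
#include <linux/module.h>
#include <net/tcp.h>

/* Invented per-connection state, kept in the congestion-control
 * private area of the socket (same trick as struct vegas below).
 */
struct ack_count {
	u32 acks;
};

static void ackcnt_init(struct tcp_sock *tp)
{
	struct ack_count *ca = tcp_ca(tp);

	ca->acks = 0;
}

/* Invoked from tcp_clean_rtx_queue() through tp->ca_ops->pkts_acked. */
static void ackcnt_pkts_acked(struct tcp_sock *tp, u32 num_acked)
{
	struct ack_count *ca = tcp_ca(tp);

	ca->acks += num_acked;
}

static struct tcp_congestion_ops tcp_ackcnt = {
	.init		= ackcnt_init,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
	.pkts_acked	= ackcnt_pkts_acked,
	.owner		= THIS_MODULE,
	.name		= "ackcnt",
};

static int __init tcp_ackcnt_register(void)
{
	BUG_ON(sizeof(struct ack_count) > TCP_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_ackcnt);
}

static void __exit tcp_ackcnt_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_ackcnt);
}

module_init(tcp_ackcnt_register);
module_exit(tcp_ackcnt_unregister);
MODULE_LICENSE("GPL");

Everything not supplied explicitly falls back to Reno behaviour, which is also what tcp_v4_init_sock() and tcp_create_openreq_child() arrange below by defaulting tp->ca_ops to &tcp_reno.
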
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad19..9122814c13ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
| @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) | |||
| 2048 | tp->mss_cache_std = tp->mss_cache = 536; | 2048 | tp->mss_cache_std = tp->mss_cache = 536; |
| 2049 | 2049 | ||
| 2050 | tp->reordering = sysctl_tcp_reordering; | 2050 | tp->reordering = sysctl_tcp_reordering; |
| 2051 | tp->ca_ops = &tcp_reno; | ||
| 2051 | 2052 | ||
| 2052 | sk->sk_state = TCP_CLOSE; | 2053 | sk->sk_state = TCP_CLOSE; |
| 2053 | 2054 | ||
| @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
| 2070 | 2071 | ||
| 2071 | tcp_clear_xmit_timers(sk); | 2072 | tcp_clear_xmit_timers(sk); |
| 2072 | 2073 | ||
| 2074 | tcp_cleanup_congestion_control(tp); | ||
| 2075 | |||
| 2073 | /* Cleanup up the write buffer. */ | 2076 | /* Cleanup up the write buffer. */ |
| 2074 | sk_stream_writequeue_purge(sk); | 2077 | sk_stream_writequeue_purge(sk); |
| 2075 | 2078 | ||
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f3..f42a284164b7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c | |||
| @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 774 | newtp->frto_counter = 0; | 774 | newtp->frto_counter = 0; |
| 775 | newtp->frto_highmark = 0; | 775 | newtp->frto_highmark = 0; |
| 776 | 776 | ||
| 777 | newtp->ca_ops = &tcp_reno; | ||
| 778 | |||
| 777 | tcp_set_ca_state(newtp, TCP_CA_Open); | 779 | tcp_set_ca_state(newtp, TCP_CA_Open); |
| 778 | tcp_init_xmit_timers(newsk); | 780 | tcp_init_xmit_timers(newsk); |
| 779 | skb_queue_head_init(&newtp->out_of_order_queue); | 781 | skb_queue_head_init(&newtp->out_of_order_queue); |
| @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |||
| 842 | if (newtp->ecn_flags&TCP_ECN_OK) | 844 | if (newtp->ecn_flags&TCP_ECN_OK) |
| 843 | sock_set_flag(newsk, SOCK_NO_LARGESEND); | 845 | sock_set_flag(newsk, SOCK_NO_LARGESEND); |
| 844 | 846 | ||
| 845 | tcp_ca_init(newtp); | ||
| 846 | |||
| 847 | TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); | 847 | TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); |
| 848 | } | 848 | } |
| 849 | return newsk; | 849 | return newsk; |
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e337..0e17c244875c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
| @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) | |||
| 111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); | 111 | u32 restart_cwnd = tcp_init_cwnd(tp, dst); |
| 112 | u32 cwnd = tp->snd_cwnd; | 112 | u32 cwnd = tp->snd_cwnd; |
| 113 | 113 | ||
| 114 | if (tcp_is_vegas(tp)) | 114 | tcp_ca_event(tp, CA_EVENT_CWND_RESTART); |
| 115 | tcp_vegas_enable(tp); | ||
| 116 | 115 | ||
| 117 | tp->snd_ssthresh = tcp_current_ssthresh(tp); | 116 | tp->snd_ssthresh = tcp_current_ssthresh(tp); |
| 118 | restart_cwnd = min(restart_cwnd, cwnd); | 117 | restart_cwnd = min(restart_cwnd, cwnd); |
| @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 280 | #define SYSCTL_FLAG_WSCALE 0x2 | 279 | #define SYSCTL_FLAG_WSCALE 0x2 |
| 281 | #define SYSCTL_FLAG_SACK 0x4 | 280 | #define SYSCTL_FLAG_SACK 0x4 |
| 282 | 281 | ||
| 282 | /* If congestion control is doing timestamping */ | ||
| 283 | if (tp->ca_ops->rtt_sample) | ||
| 284 | do_gettimeofday(&skb->stamp); | ||
| 285 | |||
| 283 | sysctl_flags = 0; | 286 | sysctl_flags = 0; |
| 284 | if (tcb->flags & TCPCB_FLAG_SYN) { | 287 | if (tcb->flags & TCPCB_FLAG_SYN) { |
| 285 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; | 288 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; |
| @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 304 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); | 307 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); |
| 305 | } | 308 | } |
| 306 | 309 | ||
| 307 | /* | 310 | if (tcp_packets_in_flight(tp) == 0) |
| 308 | * If the connection is idle and we are restarting, | 311 | tcp_ca_event(tp, CA_EVENT_TX_START); |
| 309 | * then we don't want to do any Vegas calculations | ||
| 310 | * until we get fresh RTT samples. So when we | ||
| 311 | * restart, we reset our Vegas state to a clean | ||
| 312 | * slate. After we get acks for this flight of | ||
| 313 | * packets, _then_ we can make Vegas calculations | ||
| 314 | * again. | ||
| 315 | */ | ||
| 316 | if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) | ||
| 317 | tcp_vegas_enable(tp); | ||
| 318 | 312 | ||
| 319 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); | 313 | th = (struct tcphdr *) skb_push(skb, tcp_header_size); |
| 320 | skb->h.th = th; | 314 | skb->h.th = th; |
| @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) | |||
| 521 | * skbs, which it never sent before. --ANK | 515 | * skbs, which it never sent before. --ANK |
| 522 | */ | 516 | */ |
| 523 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; | 517 | TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; |
| 518 | buff->stamp = skb->stamp; | ||
| 524 | 519 | ||
| 525 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { | 520 | if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { |
| 526 | tp->lost_out -= tcp_skb_pcount(skb); | 521 | tp->lost_out -= tcp_skb_pcount(skb); |
| @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) | |||
| 1449 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 1444 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
| 1450 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 1445 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
| 1451 | tcp_initialize_rcv_mss(sk); | 1446 | tcp_initialize_rcv_mss(sk); |
| 1452 | tcp_ca_init(tp); | ||
| 1453 | 1447 | ||
| 1454 | tcp_select_initial_window(tcp_full_space(sk), | 1448 | tcp_select_initial_window(tcp_full_space(sk), |
| 1455 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), | 1449 | tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), |
| @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) | |||
| 1503 | TCP_SKB_CB(buff)->end_seq = tp->write_seq; | 1497 | TCP_SKB_CB(buff)->end_seq = tp->write_seq; |
| 1504 | tp->snd_nxt = tp->write_seq; | 1498 | tp->snd_nxt = tp->write_seq; |
| 1505 | tp->pushed_seq = tp->write_seq; | 1499 | tp->pushed_seq = tp->write_seq; |
| 1506 | tcp_ca_init(tp); | ||
| 1507 | 1500 | ||
| 1508 | /* Send it off. */ | 1501 | /* Send it off. */ |
| 1509 | TCP_SKB_CB(buff)->when = tcp_time_stamp; | 1502 | TCP_SKB_CB(buff)->when = tcp_time_stamp; |
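
The tcp_output.c changes above are what make microsecond RTT sampling possible: each outgoing skb gets a do_gettimeofday() stamp whenever the active module supplies an rtt_sample hook, and tcp_fragment() copies the stamp so the measurement survives splitting on the retransmit queue. The value eventually handed to rtt_sample is just the timeval difference taken in tcp_clean_rtx_queue(); a stand-alone user-space rendering of that arithmetic (function and variable names invented) is:

#include <stdio.h>
#include <sys/time.h>

/* Same arithmetic as the seq_usrtt computation in tcp_clean_rtx_queue(). */
static long tv_delta_usec(const struct timeval *sent, const struct timeval *now)
{
	return (now->tv_sec - sent->tv_sec) * 1000000L
	       + (now->tv_usec - sent->tv_usec);
}

int main(void)
{
	struct timeval sent = { .tv_sec = 100, .tv_usec = 990000 };
	struct timeval now  = { .tv_sec = 101, .tv_usec =  15000 };

	/* The usec term may be negative; the sec term makes up for it,
	 * giving 25000 usec here.
	 */
	printf("usrtt = %ld usec\n", tv_delta_usec(&sent, &now));
	return 0;
}
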
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c new file mode 100644 index 000000000000..70e108e15c71 --- /dev/null +++ b/net/ipv4/tcp_scalable.c | |||
| @@ -0,0 +1,68 @@ | |||
| 1 | /* Tom Kelly's Scalable TCP | ||
| 2 | * | ||
| 3 | * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ | ||
| 4 | * | ||
| 5 | * John Heffner <jheffner@sc.edu> | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include <linux/config.h> | ||
| 9 | #include <linux/module.h> | ||
| 10 | #include <net/tcp.h> | ||
| 11 | |||
| 12 | /* These factors are derived from the recommended values in the paper: | ||
| 13 | * .01 and 7/8. We use 50 instead of 100 to account for | ||
| 14 | * delayed ack. | ||
| 15 | */ | ||
| 16 | #define TCP_SCALABLE_AI_CNT 50U | ||
| 17 | #define TCP_SCALABLE_MD_SCALE 3 | ||
| 18 | |||
| 19 | static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, | ||
| 20 | u32 in_flight, int flag) | ||
| 21 | { | ||
| 22 | if (in_flight < tp->snd_cwnd) | ||
| 23 | return; | ||
| 24 | |||
| 25 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
| 26 | tp->snd_cwnd++; | ||
| 27 | } else { | ||
| 28 | tp->snd_cwnd_cnt++; | ||
| 29 | if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ | ||
| 30 | tp->snd_cwnd++; | ||
| 31 | tp->snd_cwnd_cnt = 0; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 35 | tp->snd_cwnd_stamp = tcp_time_stamp; | ||
| 36 | } | ||
| 37 | |||
| 38 | static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) | ||
| 39 | { | ||
| 40 | return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); | ||
| 41 | } | ||
| 42 | |||
| 43 | |||
| 44 | static struct tcp_congestion_ops tcp_scalable = { | ||
| 45 | .ssthresh = tcp_scalable_ssthresh, | ||
| 46 | .cong_avoid = tcp_scalable_cong_avoid, | ||
| 47 | .min_cwnd = tcp_reno_min_cwnd, | ||
| 48 | |||
| 49 | .owner = THIS_MODULE, | ||
| 50 | .name = "scalable", | ||
| 51 | }; | ||
| 52 | |||
| 53 | static int __init tcp_scalable_register(void) | ||
| 54 | { | ||
| 55 | return tcp_register_congestion_control(&tcp_scalable); | ||
| 56 | } | ||
| 57 | |||
| 58 | static void __exit tcp_scalable_unregister(void) | ||
| 59 | { | ||
| 60 | tcp_unregister_congestion_control(&tcp_scalable); | ||
| 61 | } | ||
| 62 | |||
| 63 | module_init(tcp_scalable_register); | ||
| 64 | module_exit(tcp_scalable_unregister); | ||
| 65 | |||
| 66 | MODULE_AUTHOR("John Heffner"); | ||
| 67 | MODULE_LICENSE("GPL"); | ||
| 68 | MODULE_DESCRIPTION("Scalable TCP"); | ||
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 000000000000..9bd443db5193 --- /dev/null +++ b/net/ipv4/tcp_vegas.c | |||
| @@ -0,0 +1,411 @@ | |||
| 1 | /* | ||
| 2 | * TCP Vegas congestion control | ||
| 3 | * | ||
| 4 | * This is based on the congestion detection/avoidance scheme described in | ||
| 5 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
| 6 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
| 7 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
| 8 | * October 1995. Available from: | ||
| 9 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
| 10 | * | ||
| 11 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
| 12 | * The main aspects that distinguish this implementation from the | ||
| 13 | * Arizona Vegas implementation are: | ||
| 14 | * o We do not change the loss detection or recovery mechanisms of | ||
| 15 | * Linux in any way. Linux already recovers from losses quite well, | ||
| 16 | * using fine-grained timers, NewReno, and FACK. | ||
| 17 | * o To avoid the performance penalty imposed by increasing cwnd | ||
| 18 | * only every-other RTT during slow start, we increase during | ||
| 19 | * every RTT during slow start, just like Reno. | ||
| 20 | * o Largely to allow continuous cwnd growth during slow start, | ||
| 21 | * we use the rate at which ACKs come back as the "actual" | ||
| 22 | * rate, rather than the rate at which data is sent. | ||
| 23 | * o To speed convergence to the right rate, we set the cwnd | ||
| 24 | * to achieve the right ("actual") rate when we exit slow start. | ||
| 25 | * o To filter out the noise caused by delayed ACKs, we use the | ||
| 26 | * minimum RTT sample observed during the last RTT to calculate | ||
| 27 | * the actual rate. | ||
| 28 | * o When the sender re-starts from idle, it waits until it has | ||
| 29 | * received ACKs for an entire flight of new data before making | ||
| 30 | * a cwnd adjustment decision. The original Vegas implementation | ||
| 31 | * assumed senders never went idle. | ||
| 32 | */ | ||
| 33 | |||
| 34 | #include <linux/config.h> | ||
| 35 | #include <linux/mm.h> | ||
| 36 | #include <linux/module.h> | ||
| 37 | #include <linux/skbuff.h> | ||
| 38 | #include <linux/tcp_diag.h> | ||
| 39 | |||
| 40 | #include <net/tcp.h> | ||
| 41 | |||
| 42 | /* Default values of the Vegas variables, in fixed-point representation | ||
| 43 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
| 44 | */ | ||
| 45 | #define V_PARAM_SHIFT 1 | ||
| 46 | static int alpha = 1<<V_PARAM_SHIFT; | ||
| 47 | static int beta = 3<<V_PARAM_SHIFT; | ||
| 48 | static int gamma = 1<<V_PARAM_SHIFT; | ||
| 49 | |||
| 50 | module_param(alpha, int, 0644); | ||
| 51 | MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)"); | ||
| 52 | module_param(beta, int, 0644); | ||
| 53 | MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)"); | ||
| 54 | module_param(gamma, int, 0644); | ||
| 55 | MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); | ||
| 56 | |||
| 57 | |||
| 58 | /* Vegas variables */ | ||
| 59 | struct vegas { | ||
| 60 | u32 beg_snd_nxt; /* right edge during last RTT */ | ||
| 61 | u32 beg_snd_una; /* left edge during last RTT */ | ||
| 62 | u32 beg_snd_cwnd; /* saves the size of the cwnd */ | ||
| 63 | u8 doing_vegas_now;/* if true, do vegas for this RTT */ | ||
| 64 | u16 cntRTT; /* # of RTTs measured within last RTT */ | ||
| 65 | u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ | ||
| 66 | u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ | ||
| 67 | }; | ||
| 68 | |||
| 69 | /* There are several situations when we must "re-start" Vegas: | ||
| 70 | * | ||
| 71 | * o when a connection is established | ||
| 72 | * o after an RTO | ||
| 73 | * o after fast recovery | ||
| 74 | * o when we send a packet and there is no outstanding | ||
| 75 | * unacknowledged data (restarting an idle connection) | ||
| 76 | * | ||
| 77 | * In these circumstances we cannot do a Vegas calculation at the | ||
| 78 | * end of the first RTT, because any calculation we do is using | ||
| 79 | * stale info -- both the saved cwnd and congestion feedback are | ||
| 80 | * stale. | ||
| 81 | * | ||
| 82 | * Instead we must wait until the completion of an RTT during | ||
| 83 | * which we actually receive ACKs. | ||
| 84 | */ | ||
| 85 | static inline void vegas_enable(struct tcp_sock *tp) | ||
| 86 | { | ||
| 87 | struct vegas *vegas = tcp_ca(tp); | ||
| 88 | |||
| 89 | /* Begin taking Vegas samples next time we send something. */ | ||
| 90 | vegas->doing_vegas_now = 1; | ||
| 91 | |||
| 92 | /* Set the beginning of the next send window. */ | ||
| 93 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
| 94 | |||
| 95 | vegas->cntRTT = 0; | ||
| 96 | vegas->minRTT = 0x7fffffff; | ||
| 97 | } | ||
| 98 | |||
| 99 | /* Stop taking Vegas samples for now. */ | ||
| 100 | static inline void vegas_disable(struct tcp_sock *tp) | ||
| 101 | { | ||
| 102 | struct vegas *vegas = tcp_ca(tp); | ||
| 103 | |||
| 104 | vegas->doing_vegas_now = 0; | ||
| 105 | } | ||
| 106 | |||
| 107 | static void tcp_vegas_init(struct tcp_sock *tp) | ||
| 108 | { | ||
| 109 | struct vegas *vegas = tcp_ca(tp); | ||
| 110 | |||
| 111 | vegas->baseRTT = 0x7fffffff; | ||
| 112 | vegas_enable(tp); | ||
| 113 | } | ||
| 114 | |||
| 115 | /* Do RTT sampling needed for Vegas. | ||
| 116 | * Basically we: | ||
| 117 | * o min-filter RTT samples from within an RTT to get the current | ||
| 118 | * propagation delay + queuing delay (we are min-filtering to try to | ||
| 119 | * avoid the effects of delayed ACKs) | ||
| 120 | * o min-filter RTT samples from a much longer window (forever for now) | ||
| 121 | * to find the propagation delay (baseRTT) | ||
| 122 | */ | ||
| 123 | static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) | ||
| 124 | { | ||
| 125 | struct vegas *vegas = tcp_ca(tp); | ||
| 126 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ | ||
| 127 | |||
| 128 | /* Filter to find propagation delay: */ | ||
| 129 | if (vrtt < vegas->baseRTT) | ||
| 130 | vegas->baseRTT = vrtt; | ||
| 131 | |||
| 132 | /* Find the min RTT during the last RTT to find | ||
| 133 | * the current prop. delay + queuing delay: | ||
| 134 | */ | ||
| 135 | vegas->minRTT = min(vegas->minRTT, vrtt); | ||
| 136 | vegas->cntRTT++; | ||
| 137 | } | ||
| 138 | |||
| 139 | static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) | ||
| 140 | { | ||
| 141 | |||
| 142 | if (ca_state == TCP_CA_Open) | ||
| 143 | vegas_enable(tp); | ||
| 144 | else | ||
| 145 | vegas_disable(tp); | ||
| 146 | } | ||
| 147 | |||
| 148 | /* | ||
| 149 | * If the connection is idle and we are restarting, | ||
| 150 | * then we don't want to do any Vegas calculations | ||
| 151 | * until we get fresh RTT samples. So when we | ||
| 152 | * restart, we reset our Vegas state to a clean | ||
| 153 | * slate. After we get acks for this flight of | ||
| 154 | * packets, _then_ we can make Vegas calculations | ||
| 155 | * again. | ||
| 156 | */ | ||
| 157 | static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) | ||
| 158 | { | ||
| 159 | if (event == CA_EVENT_CWND_RESTART || | ||
| 160 | event == CA_EVENT_TX_START) | ||
| 161 | tcp_vegas_init(tp); | ||
| 162 | } | ||
| 163 | |||
| 164 | static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
| 165 | u32 seq_rtt, u32 in_flight, int flag) | ||
| 166 | { | ||
| 167 | struct vegas *vegas = tcp_ca(tp); | ||
| 168 | |||
| 169 | if (!vegas->doing_vegas_now) | ||
| 170 | return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); | ||
| 171 | |||
| 172 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
| 173 | * | ||
| 174 | * These are so named because they represent the approximate values | ||
| 175 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
| 176 | * precisely, they represent the amount of data sent during the RTT. | ||
| 177 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
| 178 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
| 179 | * bytes of data have been ACKed during the course of the RTT, giving | ||
| 180 | * an "actual" rate of: | ||
| 181 | * | ||
| 182 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
| 183 | * | ||
| 184 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
| 185 | * because delayed ACKs can cover more than one segment, so they | ||
| 186 | * don't line up nicely with the boundaries of RTTs. | ||
| 187 | * | ||
| 188 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
| 189 | * advance of the left edge of our send window, so that the number | ||
| 190 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
| 191 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
| 192 | */ | ||
| 193 | |||
| 194 | if (after(ack, vegas->beg_snd_nxt)) { | ||
| 195 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
| 196 | u32 old_wnd, old_snd_cwnd; | ||
| 197 | |||
| 198 | |||
| 199 | /* Here old_wnd is essentially the window of data that was | ||
| 200 | * sent during the previous RTT, and has all | ||
| 201 | * been acknowledged in the course of the RTT that ended | ||
| 202 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
| 203 | * is the cwnd during the previous RTT. | ||
| 204 | */ | ||
| 205 | old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / | ||
| 206 | tp->mss_cache; | ||
| 207 | old_snd_cwnd = vegas->beg_snd_cwnd; | ||
| 208 | |||
| 209 | /* Save the extent of the current window so we can use this | ||
| 210 | * at the end of the next RTT. | ||
| 211 | */ | ||
| 212 | vegas->beg_snd_una = vegas->beg_snd_nxt; | ||
| 213 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
| 214 | vegas->beg_snd_cwnd = tp->snd_cwnd; | ||
| 215 | |||
| 216 | /* Take into account the current RTT sample too, to | ||
| 217 | * decrease the impact of delayed acks. This double counts | ||
| 218 | * this sample since we count it for the next window as well, | ||
| 219 | * but that's not too awful, since we're taking the min, | ||
| 220 | * rather than averaging. | ||
| 221 | */ | ||
| 222 | tcp_vegas_rtt_calc(tp, seq_rtt*1000); | ||
| 223 | |||
| 224 | /* We do the Vegas calculations only if we got enough RTT | ||
| 225 | * samples that we can be reasonably sure that we got | ||
| 226 | * at least one RTT sample that wasn't from a delayed ACK. | ||
| 227 | * If we only had 2 samples total, | ||
| 228 | * then that means we're getting only 1 ACK per RTT, which | ||
| 229 | * means they're almost certainly delayed ACKs. | ||
| 230 | * If we have 3 samples, we should be OK. | ||
| 231 | */ | ||
| 232 | |||
| 233 | if (vegas->cntRTT <= 2) { | ||
| 234 | /* We don't have enough RTT samples to do the Vegas | ||
| 235 | * calculation, so we'll behave like Reno. | ||
| 236 | */ | ||
| 237 | if (tp->snd_cwnd > tp->snd_ssthresh) | ||
| 238 | tp->snd_cwnd++; | ||
| 239 | } else { | ||
| 240 | u32 rtt, target_cwnd, diff; | ||
| 241 | |||
| 242 | /* We have enough RTT samples, so, using the Vegas | ||
| 243 | * algorithm, we determine if we should increase or | ||
| 244 | * decrease cwnd, and by how much. | ||
| 245 | */ | ||
| 246 | |||
| 247 | /* Pluck out the RTT we are using for the Vegas | ||
| 248 | * calculations. This is the min RTT seen during the | ||
| 249 | * last RTT. Taking the min filters out the effects | ||
| 250 | * of delayed ACKs, at the cost of noticing congestion | ||
| 251 | * a bit later. | ||
| 252 | */ | ||
| 253 | rtt = vegas->minRTT; | ||
| 254 | |||
| 255 | /* Calculate the cwnd we should have, if we weren't | ||
| 256 | * going too fast. | ||
| 257 | * | ||
| 258 | * This is: | ||
| 259 | * (actual rate in segments) * baseRTT | ||
| 260 | * We keep it as a fixed point number with | ||
| 261 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
| 262 | */ | ||
| 263 | target_cwnd = ((old_wnd * vegas->baseRTT) | ||
| 264 | << V_PARAM_SHIFT) / rtt; | ||
| 265 | |||
| 266 | /* Calculate the difference between the window we had, | ||
| 267 | * and the window we would like to have. This quantity | ||
| 268 | * is the "Diff" from the Arizona Vegas papers. | ||
| 269 | * | ||
| 270 | * Again, this is a fixed point number with | ||
| 271 | * V_PARAM_SHIFT bits to the right of the binary | ||
| 272 | * point. | ||
| 273 | */ | ||
| 274 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
| 275 | |||
| 276 | if (tp->snd_cwnd < tp->snd_ssthresh) { | ||
| 277 | /* Slow start. */ | ||
| 278 | if (diff > gamma) { | ||
| 279 | /* Going too fast. Time to slow down | ||
| 280 | * and switch to congestion avoidance. | ||
| 281 | */ | ||
| 282 | tp->snd_ssthresh = 2; | ||
| 283 | |||
| 284 | /* Set cwnd to match the actual rate | ||
| 285 | * exactly: | ||
| 286 | * cwnd = (actual rate) * baseRTT | ||
| 287 | * Then we add 1 because the integer | ||
| 288 | * truncation robs us of full link | ||
| 289 | * utilization. | ||
| 290 | */ | ||
| 291 | tp->snd_cwnd = min(tp->snd_cwnd, | ||
| 292 | (target_cwnd >> | ||
| 293 | V_PARAM_SHIFT)+1); | ||
| 294 | |||
| 295 | } | ||
| 296 | } else { | ||
| 297 | /* Congestion avoidance. */ | ||
| 298 | u32 next_snd_cwnd; | ||
| 299 | |||
| 300 | /* Figure out where we would like cwnd | ||
| 301 | * to be. | ||
| 302 | */ | ||
| 303 | if (diff > beta) { | ||
| 304 | /* The old window was too fast, so | ||
| 305 | * we slow down. | ||
| 306 | */ | ||
| 307 | next_snd_cwnd = old_snd_cwnd - 1; | ||
| 308 | } else if (diff < alpha) { | ||
| 309 | /* We don't have enough extra packets | ||
| 310 | * in the network, so speed up. | ||
| 311 | */ | ||
| 312 | next_snd_cwnd = old_snd_cwnd + 1; | ||
| 313 | } else { | ||
| 314 | /* Sending just as fast as we | ||
| 315 | * should be. | ||
| 316 | */ | ||
| 317 | next_snd_cwnd = old_snd_cwnd; | ||
| 318 | } | ||
| 319 | |||
| 320 | /* Adjust cwnd upward or downward, toward the | ||
| 321 | * desired value. | ||
| 322 | */ | ||
| 323 | if (next_snd_cwnd > tp->snd_cwnd) | ||
| 324 | tp->snd_cwnd++; | ||
| 325 | else if (next_snd_cwnd < tp->snd_cwnd) | ||
| 326 | tp->snd_cwnd--; | ||
| 327 | } | ||
| 328 | } | ||
| 329 | |||
| 330 | /* Wipe the slate clean for the next RTT. */ | ||
| 331 | vegas->cntRTT = 0; | ||
| 332 | vegas->minRTT = 0x7fffffff; | ||
| 333 | } | ||
| 334 | |||
| 335 | /* The following code is executed for every ack we receive, | ||
| 336 | * except for conditions checked in should_advance_cwnd() | ||
| 337 | * before the call to tcp_cong_avoid(). Mainly this means that | ||
| 338 | * we only execute this code if the ack actually acked some | ||
| 339 | * data. | ||
| 340 | */ | ||
| 341 | |||
| 342 | /* If we are in slow start, increase our cwnd in response to this ACK. | ||
| 343 | * (If we are not in slow start then we are in congestion avoidance, | ||
| 344 | * and adjust our congestion window only once per RTT. See the code | ||
| 345 | * above.) | ||
| 346 | */ | ||
| 347 | if (tp->snd_cwnd <= tp->snd_ssthresh) | ||
| 348 | tp->snd_cwnd++; | ||
| 349 | |||
| 350 | /* to keep cwnd from growing without bound */ | ||
| 351 | tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); | ||
| 352 | |||
| 353 | /* Make sure that we are never so timid as to reduce our cwnd below | ||
| 354 | * 2 MSS. | ||
| 355 | * | ||
| 356 | * Going below 2 MSS would risk huge delayed ACKs from our receiver. | ||
| 357 | */ | ||
| 358 | tp->snd_cwnd = max(tp->snd_cwnd, 2U); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* Extract info for Tcp socket info provided via netlink. */ | ||
| 362 | static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, | ||
| 363 | struct sk_buff *skb) | ||
| 364 | { | ||
| 365 | const struct vegas *ca = tcp_ca(tp); | ||
| 366 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | ||
| 367 | struct tcpvegas_info *info; | ||
| 368 | |||
| 369 | info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, | ||
| 370 | sizeof(*info))); | ||
| 371 | |||
| 372 | info->tcpv_enabled = ca->doing_vegas_now; | ||
| 373 | info->tcpv_rttcnt = ca->cntRTT; | ||
| 374 | info->tcpv_rtt = ca->baseRTT; | ||
| 375 | info->tcpv_minrtt = ca->minRTT; | ||
| 376 | rtattr_failure: ; | ||
| 377 | } | ||
| 378 | } | ||
| 379 | |||
| 380 | static struct tcp_congestion_ops tcp_vegas = { | ||
| 381 | .init = tcp_vegas_init, | ||
| 382 | .ssthresh = tcp_reno_ssthresh, | ||
| 383 | .cong_avoid = tcp_vegas_cong_avoid, | ||
| 384 | .min_cwnd = tcp_reno_min_cwnd, | ||
| 385 | .rtt_sample = tcp_vegas_rtt_calc, | ||
| 386 | .set_state = tcp_vegas_state, | ||
| 387 | .cwnd_event = tcp_vegas_cwnd_event, | ||
| 388 | .get_info = tcp_vegas_get_info, | ||
| 389 | |||
| 390 | .owner = THIS_MODULE, | ||
| 391 | .name = "vegas", | ||
| 392 | }; | ||
| 393 | |||
| 394 | static int __init tcp_vegas_register(void) | ||
| 395 | { | ||
| 396 | BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); | ||
| 397 | tcp_register_congestion_control(&tcp_vegas); | ||
| 398 | return 0; | ||
| 399 | } | ||
| 400 | |||
| 401 | static void __exit tcp_vegas_unregister(void) | ||
| 402 | { | ||
| 403 | tcp_unregister_congestion_control(&tcp_vegas); | ||
| 404 | } | ||
| 405 | |||
| 406 | module_init(tcp_vegas_register); | ||
| 407 | module_exit(tcp_vegas_unregister); | ||
| 408 | |||
| 409 | MODULE_AUTHOR("Stephen Hemminger"); | ||
| 410 | MODULE_LICENSE("GPL"); | ||
| 411 | MODULE_DESCRIPTION("TCP Vegas"); | ||
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 000000000000..ef827242c940 --- /dev/null +++ b/net/ipv4/tcp_westwood.c | |||
| @@ -0,0 +1,259 @@ | |||
| 1 | /* | ||
| 2 | * TCP Westwood+ | ||
| 3 | * | ||
| 4 | * Angelo Dell'Aera: TCP Westwood+ support | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/config.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/skbuff.h> | ||
| 11 | #include <linux/tcp_diag.h> | ||
| 12 | #include <net/tcp.h> | ||
| 13 | |||
| 14 | /* TCP Westwood structure */ | ||
| 15 | struct westwood { | ||
| 16 | u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ | ||
| 17 | u32 bw_est; /* bandwidth estimate */ | ||
| 18 | u32 rtt_win_sx; /* here starts a new evaluation... */ | ||
| 19 | u32 bk; | ||
| 20 | u32 snd_una; /* used for evaluating the number of acked bytes */ | ||
| 21 | u32 cumul_ack; | ||
| 22 | u32 accounted; | ||
| 23 | u32 rtt; | ||
| 24 | u32 rtt_min; /* minimum observed RTT */ | ||
| 25 | }; | ||
| 26 | |||
| 27 | |||
| 28 | /* TCP Westwood functions and constants */ | ||
| 29 | #define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ | ||
| 30 | #define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ | ||
| 31 | |||
| 32 | /* | ||
| 33 | * @tcp_westwood_create | ||
| 34 | * This function initializes fields used in TCP Westwood+. | ||
| 35 | * It is called after the initial SYN, so the sequence numbers | ||
| 36 | * are correct, but for new passive connections we have no | ||
| 37 | * information about RTTmin at this time, so we simply set it to | ||
| 38 | * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative | ||
| 39 | * since in this way we're sure it will be updated in a consistent | ||
| 40 | * way as soon as possible. It will reasonably happen within the first | ||
| 41 | * RTT period of the connection lifetime. | ||
| 42 | */ | ||
| 43 | static void tcp_westwood_init(struct tcp_sock *tp) | ||
| 44 | { | ||
| 45 | struct westwood *w = tcp_ca(tp); | ||
| 46 | |||
| 47 | w->bk = 0; | ||
| 48 | w->bw_ns_est = 0; | ||
| 49 | w->bw_est = 0; | ||
| 50 | w->accounted = 0; | ||
| 51 | w->cumul_ack = 0; | ||
| 52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; | ||
| 53 | w->rtt_win_sx = tcp_time_stamp; | ||
| 54 | w->snd_una = tp->snd_una; | ||
| 55 | } | ||
| 56 | |||
| 57 | /* | ||
| 58 | * @westwood_do_filter | ||
| 59 | * Low-pass filter. Implemented using constant coefficients. | ||
| 60 | */ | ||
| 61 | static inline u32 westwood_do_filter(u32 a, u32 b) | ||
| 62 | { | ||
| 63 | return (((7 * a) + b) >> 3); | ||
| 64 | } | ||
| 65 | |||
| 66 | static inline void westwood_filter(struct westwood *w, u32 delta) | ||
| 67 | { | ||
| 68 | w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); | ||
| 69 | w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); | ||
| 70 | } | ||
| 71 | |||
| 72 | /* | ||
| 73 | * @westwood_pkts_acked | ||
| 74 | * Called after processing a group of packets, | ||
| 75 | * but all Westwood needs is the last sample of srtt. | ||
| 76 | */ | ||
| 77 | static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) | ||
| 78 | { | ||
| 79 | struct westwood *w = tcp_ca(tp); | ||
| 80 | if (cnt > 0) | ||
| 81 | w->rtt = tp->srtt >> 3; | ||
| 82 | } | ||
| 83 | |||
| 84 | /* | ||
| 85 | * @westwood_update_window | ||
| 86 | * It updates RTT evaluation window if it is the right moment to do | ||
| 87 | * it. If so it calls filter for evaluating bandwidth. | ||
| 88 | */ | ||
| 89 | static void westwood_update_window(struct tcp_sock *tp) | ||
| 90 | { | ||
| 91 | struct westwood *w = tcp_ca(tp); | ||
| 92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; | ||
| 93 | |||
| 94 | /* | ||
| 95 | * See if a RTT-window has passed. | ||
| 96 | * Be careful since if RTT is less than | ||
| 97 | * 50ms we don't filter but we continue 'building the sample'. | ||
| 98 | * This minimum limit was chosen since an estimation on small | ||
| 99 | * time intervals is better to avoid... | ||
| 100 | * Obviously on a LAN we reasonably will always have | ||
| 101 | * right_bound = left_bound + WESTWOOD_RTT_MIN | ||
| 102 | */ | ||
| 103 | if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { | ||
| 104 | westwood_filter(w, delta); | ||
| 105 | |||
| 106 | w->bk = 0; | ||
| 107 | w->rtt_win_sx = tcp_time_stamp; | ||
| 108 | } | ||
| 109 | } | ||
| 110 | |||
| 111 | /* | ||
| 112 | * @westwood_fast_bw | ||
| 113 | * Called on the fast path, i.e. when header prediction | ||
| 114 | * succeeds. In that case the update is straightforward | ||
| 115 | * and needs no special care. | ||
| 116 | */ | ||
| 117 | static inline void westwood_fast_bw(struct tcp_sock *tp) | ||
| 118 | { | ||
| 119 | struct westwood *w = tcp_ca(tp); | ||
| 120 | |||
| 121 | westwood_update_window(tp); | ||
| 122 | |||
| 123 | w->bk += tp->snd_una - w->snd_una; | ||
| 124 | w->snd_una = tp->snd_una; | ||
| 125 | w->rtt_min = min(w->rtt, w->rtt_min); | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | ||
| 129 | * @westwood_acked_count | ||
| 130 | * This function computes cumul_ack, used to update bk, | ||
| 131 | * in the case of delayed or partial acks. | ||
| 132 | */ | ||
| 133 | static inline u32 westwood_acked_count(struct tcp_sock *tp) | ||
| 134 | { | ||
| 135 | struct westwood *w = tcp_ca(tp); | ||
| 136 | |||
| 137 | w->cumul_ack = tp->snd_una - w->snd_una; | ||
| 138 | |||
| 139 | /* If cumul_ack is 0 this is a dupack since it's not moving | ||
| 140 | * tp->snd_una. | ||
| 141 | */ | ||
| 142 | if (!w->cumul_ack) { | ||
| 143 | w->accounted += tp->mss_cache; | ||
| 144 | w->cumul_ack = tp->mss_cache; | ||
| 145 | } | ||
| 146 | |||
| 147 | if (w->cumul_ack > tp->mss_cache) { | ||
| 148 | /* Partial or delayed ack */ | ||
| 149 | if (w->accounted >= w->cumul_ack) { | ||
| 150 | w->accounted -= w->cumul_ack; | ||
| 151 | w->cumul_ack = tp->mss_cache; | ||
| 152 | } else { | ||
| 153 | w->cumul_ack -= w->accounted; | ||
| 154 | w->accounted = 0; | ||
| 155 | } | ||
| 156 | } | ||
| 157 | |||
| 158 | w->snd_una = tp->snd_una; | ||
| 159 | |||
| 160 | return w->cumul_ack; | ||
| 161 | } | ||
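
To make the bookkeeping above concrete, here is a small userspace sketch (hypothetical MSS value and ack sequence; acked_count() is an illustrative stand-in, not kernel code) showing how dupacks are credited once and then reconciled when the cumulative ACK arrives:

#include <stdio.h>

#define MSS 1000u	/* hypothetical mss_cache */

static unsigned accounted;	/* bytes already credited by dupacks */

/* Same branch structure as westwood_acked_count(), with
 * tp->snd_una - w->snd_una passed in directly as "cumul_ack". */
static unsigned acked_count(unsigned cumul_ack)
{
	if (!cumul_ack) {		/* dupack: snd_una did not move */
		accounted += MSS;
		cumul_ack = MSS;
	}

	if (cumul_ack > MSS) {		/* partial or delayed ack */
		if (accounted >= cumul_ack) {
			accounted -= cumul_ack;
			cumul_ack = MSS;
		} else {
			cumul_ack -= accounted;
			accounted = 0;
		}
	}
	return cumul_ack;
}

int main(void)
{
	/* Two dupacks, then an ack that advances snd_una by 3*MSS:
	 * 1000 + 1000 + 1000 = 3000 bytes credited in total, not 5000. */
	printf("%u\n", acked_count(0));
	printf("%u\n", acked_count(0));
	printf("%u\n", acked_count(3 * MSS));
	return 0;
}

The point of the accounted counter is exactly this double-counting avoidance: bytes credited while snd_una was stalled are subtracted again once the cumulative ACK finally covers them.
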
| 162 | |||
| 163 | static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) | ||
| 164 | { | ||
| 165 | struct westwood *w = tcp_ca(tp); | ||
| 166 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | ||
| 167 | } | ||
| 168 | |||
| 169 | /* | ||
| 170 | * TCP Westwood | ||
| 171 | * Here the limit is computed as bandwidth estimate * RTTmin | ||
| 172 | * (divided by mss_cache to express it in packets). The result | ||
| 173 | * is clamped to be >= 2, so this never returns 0. | ||
| 174 | */ | ||
| 175 | static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) | ||
| 176 | { | ||
| 177 | return westwood_bw_rttmin(tp); | ||
| 178 | } | ||
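
For a feel of the numbers involved, the following userspace sketch (all values hypothetical, not taken from the patch) evaluates the same expression: the estimated bandwidth-delay product expressed in packets, floored at 2 segments:

#include <stdio.h>

/* Same computation as westwood_bw_rttmin(): bandwidth-delay product
 * in packets, clamped to at least 2. */
static unsigned bw_rttmin(unsigned bw_est, unsigned rtt_min, unsigned mss)
{
	unsigned w = (bw_est * rtt_min) / mss;

	return w > 2 ? w : 2;
}

int main(void)
{
	/* Assuming HZ=1000 so one jiffy is 1 ms:
	 * bw_est = 1500 bytes/jiffy (~12 Mbit/s), rtt_min = 100 jiffies,
	 * mss = 1500 bytes -> window of 100 packets. */
	printf("%u packets\n", bw_rttmin(1500, 100, 1500));
	return 0;
}

This is what the CA_EVENT_COMPLETE_CWR and CA_EVENT_FRTO cases below feed into cwnd/ssthresh: after a congestion episode the window restarts at the measured pipe size instead of being halved blindly.
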
| 179 | |||
| 180 | static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) | ||
| 181 | { | ||
| 182 | struct westwood *w = tcp_ca(tp); | ||
| 183 | |||
| 184 | switch(event) { | ||
| 185 | case CA_EVENT_FAST_ACK: | ||
| 186 | westwood_fast_bw(tp); | ||
| 187 | break; | ||
| 188 | |||
| 189 | case CA_EVENT_COMPLETE_CWR: | ||
| 190 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); | ||
| 191 | break; | ||
| 192 | |||
| 193 | case CA_EVENT_FRTO: | ||
| 194 | tp->snd_ssthresh = westwood_bw_rttmin(tp); | ||
| 195 | break; | ||
| 196 | |||
| 197 | case CA_EVENT_SLOW_ACK: | ||
| 198 | westwood_update_window(tp); | ||
| 199 | w->bk += westwood_acked_count(tp); | ||
| 200 | w->rtt_min = min(w->rtt, w->rtt_min); | ||
| 201 | break; | ||
| 202 | |||
| 203 | default: | ||
| 204 | /* don't care */ | ||
| 205 | break; | ||
| 206 | } | ||
| 207 | } | ||
| 208 | |||
| 209 | |||
| 210 | /* Fill in Westwood state for the TCP socket info reported via netlink. */ | ||
| 211 | static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, | ||
| 212 | struct sk_buff *skb) | ||
| 213 | { | ||
| 214 | const struct westwood *ca = tcp_ca(tp); | ||
| 215 | if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { | ||
| 216 | struct rtattr *rta; | ||
| 217 | struct tcpvegas_info *info; | ||
| 218 | |||
| 219 | rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); | ||
| 220 | info = RTA_DATA(rta); | ||
| 221 | info->tcpv_enabled = 1; | ||
| 222 | info->tcpv_rttcnt = 0; | ||
| 223 | info->tcpv_rtt = jiffies_to_usecs(ca->rtt); | ||
| 224 | info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); | ||
| 225 | rtattr_failure: ; | ||
| 226 | } | ||
| 227 | } | ||
| 228 | |||
| 229 | |||
| 230 | static struct tcp_congestion_ops tcp_westwood = { | ||
| 231 | .init = tcp_westwood_init, | ||
| 232 | .ssthresh = tcp_reno_ssthresh, | ||
| 233 | .cong_avoid = tcp_reno_cong_avoid, | ||
| 234 | .min_cwnd = tcp_westwood_cwnd_min, | ||
| 235 | .cwnd_event = tcp_westwood_event, | ||
| 236 | .get_info = tcp_westwood_info, | ||
| 237 | .pkts_acked = tcp_westwood_pkts_acked, | ||
| 238 | |||
| 239 | .owner = THIS_MODULE, | ||
| 240 | .name = "westwood" | ||
| 241 | }; | ||
| 242 | |||
| 243 | static int __init tcp_westwood_register(void) | ||
| 244 | { | ||
| 245 | BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); | ||
| 246 | return tcp_register_congestion_control(&tcp_westwood); | ||
| 247 | } | ||
| 248 | |||
| 249 | static void __exit tcp_westwood_unregister(void) | ||
| 250 | { | ||
| 251 | tcp_unregister_congestion_control(&tcp_westwood); | ||
| 252 | } | ||
| 253 | |||
| 254 | module_init(tcp_westwood_register); | ||
| 255 | module_exit(tcp_westwood_unregister); | ||
| 256 | |||
| 257 | MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); | ||
| 258 | MODULE_LICENSE("GPL"); | ||
| 259 | MODULE_DESCRIPTION("TCP Westwood+"); | ||
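
The registration boilerplate above is essentially the whole surface a congestion control module needs with this patch set. As an illustration only, a do-nothing module might look like the sketch below. It reuses the Reno helpers for ssthresh and cong_avoid exactly as Westwood does; the "noop" name and its state field are invented for this example, it would only build against a tree carrying this series, and treating the omitted hooks (min_cwnd, cwnd_event, get_info, pkts_acked) as optional is an assumption rather than something this hunk demonstrates.

/* Sketch only: a skeletal congestion control module mirroring the
 * registration pattern used by tcp_westwood above. */
#include <linux/module.h>
#include <net/tcp.h>

struct noop_state {
	u32 dummy;	/* per-connection scratch; must fit in TCP_CA_PRIV_SIZE */
};

static void tcp_noop_init(struct tcp_sock *tp)
{
	struct noop_state *s = tcp_ca(tp);

	s->dummy = 0;
}

static struct tcp_congestion_ops tcp_noop = {
	.init		= tcp_noop_init,
	.ssthresh	= tcp_reno_ssthresh,	/* reuse Reno, as Westwood does */
	.cong_avoid	= tcp_reno_cong_avoid,
	.owner		= THIS_MODULE,
	.name		= "noop",
};

static int __init tcp_noop_register(void)
{
	BUG_ON(sizeof(struct noop_state) > TCP_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_noop);
}

static void __exit tcp_noop_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_noop);
}

module_init(tcp_noop_register);
module_exit(tcp_noop_unregister);
MODULE_LICENSE("GPL");

Once registered, an algorithm becomes selectable by its .name string, the same way the "westwood" entry above exports this module.
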
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a83..fce56039b0e9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
| @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) | |||
| 2025 | sk->sk_state = TCP_CLOSE; | 2025 | sk->sk_state = TCP_CLOSE; |
| 2026 | 2026 | ||
| 2027 | tp->af_specific = &ipv6_specific; | 2027 | tp->af_specific = &ipv6_specific; |
| 2028 | 2028 | tp->ca_ops = &tcp_reno; | |
| 2029 | sk->sk_write_space = sk_stream_write_space; | 2029 | sk->sk_write_space = sk_stream_write_space; |
| 2030 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); | 2030 | sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |
| 2031 | 2031 | ||
