author		Stephen Hemminger <shemminger@osdl.org>	2005-06-23 15:19:55 -0400
committer	David S. Miller <davem@davemloft.net>	2005-06-23 15:19:55 -0400
commit		317a76f9a44b437d6301718f4e5d08bd93f98da7 (patch)
tree		caeba9839dee264f59b035b81c3d13d6c61b638e /include
parent		a8ad86f2dc46356f87be1327dabc18bdbda32f50 (diff)
[TCP]: Add pluggable congestion control algorithm infrastructure.
Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in or
modules.  The legacy "new RENO" algorithm is used as a starting point
and fallback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include')
-rw-r--r--	include/linux/sysctl.h	9
-rw-r--r--	include/linux/tcp.h	49
-rw-r--r--	include/net/tcp.h	237
3 files changed, 86 insertions(+), 209 deletions(-)
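
As a sketch of how a module would plug into this interface: only the
tcp_congestion_ops API and the tcp_reno_* helpers below are from the
patch; the "dummy" names and hook bodies are illustrative. Of the hooks,
only ssthresh and cong_avoid are required.

	#include <linux/module.h>
	#include <net/tcp.h>

	static u32 dummy_ssthresh(struct tcp_sock *tp)
	{
		/* Halve the window on loss, but keep at least two segments. */
		return max(tp->snd_cwnd >> 1U, 2U);
	}

	static void dummy_cong_avoid(struct tcp_sock *tp, u32 ack,
				     u32 rtt, u32 in_flight, int good_ack)
	{
		/* Fall back to the Reno helper exported by this patch. */
		tcp_reno_cong_avoid(tp, ack, rtt, in_flight, good_ack);
	}

	static struct tcp_congestion_ops dummy_cong = {
		.name		= "dummy",
		.owner		= THIS_MODULE,
		.ssthresh	= dummy_ssthresh,
		.cong_avoid	= dummy_cong_avoid,
	};

	static int __init dummy_init(void)
	{
		return tcp_register_congestion_control(&dummy_cong);
	}

	static void __exit dummy_exit(void)
	{
		tcp_unregister_congestion_control(&dummy_cong);
	}

	module_init(dummy_init);
	module_exit(dummy_exit);
	MODULE_LICENSE("GPL");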
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 614e939c78a4..72965bfe6cfb 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -333,21 +333,14 @@ enum
 	NET_TCP_FRTO=92,
 	NET_TCP_LOW_LATENCY=93,
 	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
-	NET_TCP_WESTWOOD=95,
 	NET_IPV4_IGMP_MAX_MSF=96,
 	NET_TCP_NO_METRICS_SAVE=97,
-	NET_TCP_VEGAS=98,
-	NET_TCP_VEGAS_ALPHA=99,
-	NET_TCP_VEGAS_BETA=100,
-	NET_TCP_VEGAS_GAMMA=101,
-	NET_TCP_BIC=102,
-	NET_TCP_BIC_FAST_CONVERGENCE=103,
-	NET_TCP_BIC_LOW_WINDOW=104,
 	NET_TCP_DEFAULT_WIN_SCALE=105,
 	NET_TCP_MODERATE_RCVBUF=106,
 	NET_TCP_TSO_WIN_DIVISOR=107,
 	NET_TCP_BIC_BETA=108,
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
+	NET_TCP_CONG_CONTROL=110,
 };
 
 enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 97a7c9e03df5..3ea75dd6640a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -203,13 +203,6 @@ struct tcp_sack_block {
 	__u32	end_seq;
 };
 
-enum tcp_congestion_algo {
-	TCP_RENO=0,
-	TCP_VEGAS,
-	TCP_WESTWOOD,
-	TCP_BIC,
-};
-
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	long	ts_recent_stamp;	/* Time we stored ts_recent (for aging) */
@@ -305,7 +298,7 @@ struct tcp_sock {
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	frto_counter;	/* Number of new acks after RTO */
 
-	__u8	adv_cong;	/* Using Vegas, Westwood, or BIC */
+	__u8	unused;
 	__u8	defer_accept;	/* User waits for some data after accept() */
 
 /* RTT measurement */
@@ -401,37 +394,10 @@ struct tcp_sock {
 		__u32	time;
 	} rcvq_space;
 
-/* TCP Westwood structure */
-	struct {
-		__u32	bw_ns_est;	/* first bandwidth estimation..not too smoothed 8) */
-		__u32	bw_est;		/* bandwidth estimate */
-		__u32	rtt_win_sx;	/* here starts a new evaluation... */
-		__u32	bk;
-		__u32	snd_una;	/* used for evaluating the number of acked bytes */
-		__u32	cumul_ack;
-		__u32	accounted;
-		__u32	rtt;
-		__u32	rtt_min;	/* minimum observed RTT */
-	} westwood;
-
-/* Vegas variables */
-	struct {
-		__u32	beg_snd_nxt;	/* right edge during last RTT */
-		__u32	beg_snd_una;	/* left edge during last RTT */
-		__u32	beg_snd_cwnd;	/* saves the size of the cwnd */
-		__u8	doing_vegas_now;/* if true, do vegas for this RTT */
-		__u16	cntRTT;		/* # of RTTs measured within last RTT */
-		__u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
-		__u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
-	} vegas;
-
-	/* BI TCP Parameters */
-	struct {
-		__u32	cnt;		/* increase cwnd by 1 after this number of ACKs */
-		__u32	last_max_cwnd;	/* last maximum snd_cwnd */
-		__u32	last_cwnd;	/* the last snd_cwnd */
-		__u32	last_stamp;	/* time when updated last_cwnd */
-	} bictcp;
+	/* Pluggable TCP congestion control hook */
+	struct tcp_congestion_ops *ca_ops;
+	u32	ca_priv[16];
+#define TCP_CA_PRIV_SIZE	(16*sizeof(u32))
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
+static inline void *tcp_ca(const struct tcp_sock *tp)
+{
+	return (void *) tp->ca_priv;
+}
+
 #endif
 
 #endif	/* _LINUX_TCP_H */
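
A sketch of how an algorithm would keep per-connection state in the new
ca_priv scratch area via tcp_ca(); the struct and field names here are
hypothetical, only tcp_ca() and TCP_CA_PRIV_SIZE come from the patch.

	/* Hypothetical private state; it must fit in TCP_CA_PRIV_SIZE
	 * (16 * sizeof(u32)) bytes, since it lives inside tcp_sock. */
	struct dummy_ca_state {
		u32	ack_cnt;	/* ACKs seen since last cwnd increase */
		u32	epoch_start;	/* illustrative timestamp field */
	};

	static void dummy_ca_init(struct tcp_sock *tp)
	{
		struct dummy_ca_state *ca = tcp_ca(tp);	/* points at tp->ca_priv */

		ca->ack_cnt = 0;
		ca->epoch_start = 0;
	}

An .init hook like this would be wired into struct tcp_congestion_ops,
declared below in net/tcp.h.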
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f730935b824a..e427cf35915c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #else
 # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
 #endif
-
-#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
-					 * max_cwnd = snd_cwnd * beta
-					 */
-#define BICTCP_MAX_INCREMENT 32		/*
-					 * Limit on the amount of
-					 * increment allowed during
-					 * binary search.
-					 */
-#define BICTCP_FUNC_OF_MIN_INCR 11	/*
-					 * log(B/Smin)/log(B/(B-1))+1,
-					 * Smin: min increment
-					 * B: log factor
-					 */
-#define BICTCP_B		4	/*
-					 * In binary search,
-					 * go to point (max+min)/N
-					 */
-
 /*
  *	TCP option
  */
@@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
-extern int sysctl_tcp_vegas_alpha;
-extern int sysctl_tcp_vegas_beta;
-extern int sysctl_tcp_vegas_gamma;
 extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
-extern int sysctl_tcp_bic_fast_convergence;
-extern int sysctl_tcp_bic_low_window;
-extern int sysctl_tcp_bic_beta;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 
@@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp,
 	tp->packets_out -= tcp_skb_pcount(skb);
 }
 
+/* Events passed to congestion control interface */
+enum tcp_ca_event {
+	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
+	CA_EVENT_CWND_RESTART,	/* congestion window restart */
+	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
+	CA_EVENT_FRTO,		/* fast recovery timeout */
+	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_FAST_ACK,	/* in sequence ack */
+	CA_EVENT_SLOW_ACK,	/* other ack */
+};
+
+/*
+ * Interface for adding new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX	16
+struct tcp_congestion_ops {
+	struct list_head	list;
+
+	/* initialize private data (optional) */
+	void (*init)(struct tcp_sock *tp);
+	/* cleanup private data (optional) */
+	void (*release)(struct tcp_sock *tp);
+
+	/* return slow start threshold (required) */
+	u32 (*ssthresh)(struct tcp_sock *tp);
+	/* lower bound for congestion window (optional) */
+	u32 (*min_cwnd)(struct tcp_sock *tp);
+	/* do new cwnd calculation (required) */
+	void (*cong_avoid)(struct tcp_sock *tp, u32 ack,
+			   u32 rtt, u32 in_flight, int good_ack);
+	/* round trip time sample per acked packet (optional) */
+	void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
+	/* call before changing ca_state (optional) */
+	void (*set_state)(struct tcp_sock *tp, u8 new_state);
+	/* call when cwnd event occurs (optional) */
+	void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
+	/* new value of cwnd after loss (optional) */
+	u32  (*undo_cwnd)(struct tcp_sock *tp);
+	/* hook for packet ack accounting (optional) */
+	void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
+	/* get info for tcp_diag (optional) */
+	void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);
+
+	char		name[TCP_CA_NAME_MAX];
+	struct module	*owner;
+};
+
+extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
+extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+
+extern void tcp_init_congestion_control(struct tcp_sock *tp);
+extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
+extern int tcp_set_default_congestion_control(const char *name);
+extern void tcp_get_default_congestion_control(char *name);
+
+extern struct tcp_congestion_ops tcp_reno;
+extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
+extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
+				u32 rtt, u32 in_flight, int flag);
+extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
+
+static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
+{
+	if (tp->ca_ops->set_state)
+		tp->ca_ops->set_state(tp, ca_state);
+	tp->ca_state = ca_state;
+}
+
+static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	if (tp->ca_ops->cwnd_event)
+		tp->ca_ops->cwnd_event(tp, event);
+}
+
 /* This determines how many packets are "in the network" to the best
  * of our knowledge.  In many cases it is conservative, but where
  * detailed information is available from the receiver (via SACK
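
To illustrate the calling convention the two inline wrappers above
establish (the caller's name here is invented for the example): optional
hooks are NULL-checked by the wrappers, so an algorithm only fills in
what it needs.

	/* Illustrative caller, not from this patch: how core TCP code
	 * would notify the algorithm when restarting the window. */
	static void example_cwnd_restart(struct tcp_sock *tp)
	{
		tcp_ca_event(tp, CA_EVENT_CWND_RESTART);	/* cwnd_event may be NULL */
		tcp_set_ca_state(tp, TCP_CA_Open);		/* set_state runs before ca_state changes */
	}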
@@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 	return (tp->packets_out - tp->left_out + tp->retrans_out);
 }
 
-/*
- * Which congestion algorithm is in use on the connection.
- */
-#define tcp_is_vegas(__tp)	((__tp)->adv_cong == TCP_VEGAS)
-#define tcp_is_westwood(__tp)	((__tp)->adv_cong == TCP_WESTWOOD)
-#define tcp_is_bic(__tp)	((__tp)->adv_cong == TCP_BIC)
-
-/* Recalculate snd_ssthresh, we want to set it to:
- *
- * Reno:
- *   one half the current congestion window, but no
- *   less than two segments
- *
- * BIC:
- *   behave like Reno until low_window is reached,
- *   then increase congestion window slowly
- */
-static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
-{
-	if (tcp_is_bic(tp)) {
-		if (sysctl_tcp_bic_fast_convergence &&
-		    tp->snd_cwnd < tp->bictcp.last_max_cwnd)
-			tp->bictcp.last_max_cwnd = (tp->snd_cwnd *
-						    (BICTCP_BETA_SCALE
-						     + sysctl_tcp_bic_beta))
-				/ (2 * BICTCP_BETA_SCALE);
-		else
-			tp->bictcp.last_max_cwnd = tp->snd_cwnd;
-
-		if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
-			return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
-				   / BICTCP_BETA_SCALE, 2U);
-	}
-
-	return max(tp->snd_cwnd >> 1U, 2U);
-}
-
-/* Stop taking Vegas samples for now. */
-#define tcp_vegas_disable(__tp)	((__tp)->vegas.doing_vegas_now = 0)
-
-static inline void tcp_vegas_enable(struct tcp_sock *tp)
-{
-	/* There are several situations when we must "re-start" Vegas:
-	 *
-	 *  o when a connection is established
-	 *  o after an RTO
-	 *  o after fast recovery
-	 *  o when we send a packet and there is no outstanding
-	 *    unacknowledged data (restarting an idle connection)
-	 *
-	 * In these circumstances we cannot do a Vegas calculation at the
-	 * end of the first RTT, because any calculation we do is using
-	 * stale info -- both the saved cwnd and congestion feedback are
-	 * stale.
-	 *
-	 * Instead we must wait until the completion of an RTT during
-	 * which we actually receive ACKs.
-	 */
-
-	/* Begin taking Vegas samples next time we send something. */
-	tp->vegas.doing_vegas_now = 1;
-
-	/* Set the beginning of the next send window. */
-	tp->vegas.beg_snd_nxt = tp->snd_nxt;
-
-	tp->vegas.cntRTT = 0;
-	tp->vegas.minRTT = 0x7fffffff;
-}
-
-/* Should we be taking Vegas samples right now? */
-#define tcp_vegas_enabled(__tp)	((__tp)->vegas.doing_vegas_now)
-
-extern void tcp_ca_init(struct tcp_sock *tp);
-
-static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
-{
-	if (tcp_is_vegas(tp)) {
-		if (ca_state == TCP_CA_Open)
-			tcp_vegas_enable(tp);
-		else
-			tcp_vegas_disable(tp);
-	}
-	tp->ca_state = ca_state;
-}
-
 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
  * The exception is rate halving phase, when cwnd is decreasing towards
  * ssthresh.
@@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 static inline void __tcp_enter_cwr(struct tcp_sock *tp)
 {
 	tp->undo_marker = 0;
-	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 	tp->snd_cwnd = min(tp->snd_cwnd,
 			   tcp_packets_in_flight(tp) + 1U);
 	tp->snd_cwnd_cnt = 0;
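
Note the contrast with the optional hooks: ssthresh is required, so
__tcp_enter_cwr() calls it unconditionally through the ops pointer. A
plausible sketch of the exported Reno fallback, presumably mirroring the
default branch of the removed tcp_recalc_ssthresh() above (the body is
an assumption, only the declaration is in this patch):

	/* Presumed tcp_reno_ssthresh(): half the window on congestion,
	 * but never below two segments. */
	u32 tcp_reno_ssthresh(struct tcp_sock *tp)
	{
		return max(tp->snd_cwnd >> 1U, 2U);
	}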
@@ -1876,52 +1837,4 @@ struct tcp_iter_state {
 extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
 extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
 
-/* TCP Westwood functions and constants */
-
-#define TCP_WESTWOOD_INIT_RTT	(20*HZ)	/* maybe too conservative?! */
-#define TCP_WESTWOOD_RTT_MIN	(HZ/20)	/* 50ms */
-
-static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
-{
-	if (tcp_is_westwood(tp))
-		tp->westwood.rtt = rtt_seq;
-}
-
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
-		   (__u32) (tp->mss_cache_std),
-		   2U);
-}
-
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
-static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
-{
-	__u32 ssthresh = 0;
-
-	if (tcp_is_westwood(tp)) {
-		ssthresh = __tcp_westwood_bw_rttmin(tp);
-		if (ssthresh)
-			tp->snd_ssthresh = ssthresh;
-	}
-
-	return (ssthresh != 0);
-}
-
-static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
-{
-	__u32 cwnd = 0;
-
-	if (tcp_is_westwood(tp)) {
-		cwnd = __tcp_westwood_bw_rttmin(tp);
-		if (cwnd)
-			tp->snd_cwnd = cwnd;
-	}
-
-	return (cwnd != 0);
-}
 #endif	/* _TCP_H */