path: root/include/net
author	Stephen Hemminger <shemminger@osdl.org>	2005-06-23 15:19:55 -0400
committer	David S. Miller <davem@davemloft.net>	2005-06-23 15:19:55 -0400
commit	317a76f9a44b437d6301718f4e5d08bd93f98da7 (patch)
tree	caeba9839dee264f59b035b81c3d13d6c61b638e /include/net
parent	a8ad86f2dc46356f87be1327dabc18bdbda32f50 (diff)
[TCP]: Add pluggable congestion control algorithm infrastructure.
Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in
or modules.  The legacy "new RENO" algorithm is used as a starting
point and fallback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
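As a rough illustration of how the new interface is meant to be consumed, the sketch below shows a minimal out-of-tree congestion control module built only against the hooks this patch declares (struct tcp_congestion_ops, tcp_register_congestion_control(), tcp_unregister_congestion_control(), and the exported tcp_reno_* helpers). The "simple" name and its trivial callbacks are hypothetical examples, not part of the patch.

/* Hypothetical example module; only the ops/register API comes from this patch. */
#include <linux/module.h>
#include <net/tcp.h>

/* Required hook: slow start threshold after loss (Reno-style halving). */
static u32 simple_ssthresh(struct tcp_sock *tp)
{
	return max(tp->snd_cwnd >> 1U, 2U);
}

/* Required hook: cwnd growth; here it simply reuses the exported Reno helper. */
static void simple_cong_avoid(struct tcp_sock *tp, u32 ack,
			      u32 rtt, u32 in_flight, int good_ack)
{
	tcp_reno_cong_avoid(tp, ack, rtt, in_flight, good_ack);
}

static struct tcp_congestion_ops tcp_simple = {
	.name		= "simple",
	.owner		= THIS_MODULE,
	.ssthresh	= simple_ssthresh,
	.cong_avoid	= simple_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
	/* init/release/set_state/cwnd_event and the other hooks are optional */
};

static int __init simple_register(void)
{
	return tcp_register_congestion_control(&tcp_simple);
}

static void __exit simple_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_simple);
}

module_init(simple_register);
module_exit(simple_unregister);
MODULE_LICENSE("GPL");

Once registered, an algorithm can be made the system default through tcp_set_default_congestion_control(), which this patch also adds; the built-in Reno ops remain the fallback.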
Diffstat (limited to 'include/net')
-rw-r--r--	include/net/tcp.h	237
1 file changed, 75 insertions(+), 162 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f730935b824a..e427cf35915c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #else
 # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
 #endif
-
-#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
-					 * max_cwnd = snd_cwnd * beta
-					 */
-#define BICTCP_MAX_INCREMENT 32		/*
-					 * Limit on the amount of
-					 * increment allowed during
-					 * binary search.
-					 */
-#define BICTCP_FUNC_OF_MIN_INCR 11	/*
-					 * log(B/Smin)/log(B/(B-1))+1,
-					 * Smin:min increment
-					 * B:log factor
-					 */
-#define BICTCP_B		4	/*
-					 * In binary search,
-					 * go to point (max+min)/N
-					 */
-
 /*
  * TCP option
  */
@@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
-extern int sysctl_tcp_vegas_alpha;
-extern int sysctl_tcp_vegas_beta;
-extern int sysctl_tcp_vegas_gamma;
 extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
-extern int sysctl_tcp_bic_fast_convergence;
-extern int sysctl_tcp_bic_low_window;
-extern int sysctl_tcp_bic_beta;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 
@@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp,
 	tp->packets_out -= tcp_skb_pcount(skb);
 }
 
+/* Events passed to congestion control interface */
+enum tcp_ca_event {
+	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
+	CA_EVENT_CWND_RESTART,	/* congestion window restart */
+	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
+	CA_EVENT_FRTO,		/* fast recovery timeout */
+	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_FAST_ACK,	/* in sequence ack */
+	CA_EVENT_SLOW_ACK,	/* other ack */
+};
+
+/*
+ * Interface for adding new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX	16
+struct tcp_congestion_ops {
+	struct list_head	list;
+
+	/* initialize private data (optional) */
+	void (*init)(struct tcp_sock *tp);
+	/* cleanup private data (optional) */
+	void (*release)(struct tcp_sock *tp);
+
+	/* return slow start threshold (required) */
+	u32 (*ssthresh)(struct tcp_sock *tp);
+	/* lower bound for congestion window (optional) */
+	u32 (*min_cwnd)(struct tcp_sock *tp);
+	/* do new cwnd calculation (required) */
+	void (*cong_avoid)(struct tcp_sock *tp, u32 ack,
+			   u32 rtt, u32 in_flight, int good_ack);
+	/* round trip time sample per acked packet (optional) */
+	void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
+	/* call before changing ca_state (optional) */
+	void (*set_state)(struct tcp_sock *tp, u8 new_state);
+	/* call when cwnd event occurs (optional) */
+	void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
+	/* new value of cwnd after loss (optional) */
+	u32  (*undo_cwnd)(struct tcp_sock *tp);
+	/* hook for packet ack accounting (optional) */
+	void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
+	/* get info for tcp_diag (optional) */
+	void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);
+
+	char		name[TCP_CA_NAME_MAX];
+	struct module	*owner;
+};
+
+extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
+extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+
+extern void tcp_init_congestion_control(struct tcp_sock *tp);
+extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
+extern int tcp_set_default_congestion_control(const char *name);
+extern void tcp_get_default_congestion_control(char *name);
+
+extern struct tcp_congestion_ops tcp_reno;
+extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
+extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
+				u32 rtt, u32 in_flight, int flag);
+extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
+
+static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
+{
+	if (tp->ca_ops->set_state)
+		tp->ca_ops->set_state(tp, ca_state);
+	tp->ca_state = ca_state;
+}
+
+static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	if (tp->ca_ops->cwnd_event)
+		tp->ca_ops->cwnd_event(tp, event);
+}
+
 /* This determines how many packets are "in the network" to the best
  * of our knowledge.  In many cases it is conservative, but where
  * detailed information is available from the receiver (via SACK
@@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 	return (tp->packets_out - tp->left_out + tp->retrans_out);
 }
 
-/*
- * Which congestion algorithim is in use on the connection.
- */
-#define tcp_is_vegas(__tp)	((__tp)->adv_cong == TCP_VEGAS)
-#define tcp_is_westwood(__tp)	((__tp)->adv_cong == TCP_WESTWOOD)
-#define tcp_is_bic(__tp)	((__tp)->adv_cong == TCP_BIC)
-
-/* Recalculate snd_ssthresh, we want to set it to:
- *
- * Reno:
- *   one half the current congestion window, but no
- *   less than two segments
- *
- * BIC:
- *   behave like Reno until low_window is reached,
- *   then increase congestion window slowly
- */
-static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
-{
-	if (tcp_is_bic(tp)) {
-		if (sysctl_tcp_bic_fast_convergence &&
-		    tp->snd_cwnd < tp->bictcp.last_max_cwnd)
-			tp->bictcp.last_max_cwnd = (tp->snd_cwnd *
-						    (BICTCP_BETA_SCALE
-						     + sysctl_tcp_bic_beta))
-				/ (2 * BICTCP_BETA_SCALE);
-		else
-			tp->bictcp.last_max_cwnd = tp->snd_cwnd;
-
-		if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
-			return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
-				   / BICTCP_BETA_SCALE, 2U);
-	}
-
-	return max(tp->snd_cwnd >> 1U, 2U);
-}
-
-/* Stop taking Vegas samples for now. */
-#define tcp_vegas_disable(__tp)	((__tp)->vegas.doing_vegas_now = 0)
-
-static inline void tcp_vegas_enable(struct tcp_sock *tp)
-{
-	/* There are several situations when we must "re-start" Vegas:
-	 *
-	 *  o when a connection is established
-	 *  o after an RTO
-	 *  o after fast recovery
-	 *  o when we send a packet and there is no outstanding
-	 *    unacknowledged data (restarting an idle connection)
-	 *
-	 * In these circumstances we cannot do a Vegas calculation at the
-	 * end of the first RTT, because any calculation we do is using
-	 * stale info -- both the saved cwnd and congestion feedback are
-	 * stale.
-	 *
-	 * Instead we must wait until the completion of an RTT during
-	 * which we actually receive ACKs.
-	 */
-
-	/* Begin taking Vegas samples next time we send something. */
-	tp->vegas.doing_vegas_now = 1;
-
-	/* Set the beginning of the next send window. */
-	tp->vegas.beg_snd_nxt = tp->snd_nxt;
-
-	tp->vegas.cntRTT = 0;
-	tp->vegas.minRTT = 0x7fffffff;
-}
-
-/* Should we be taking Vegas samples right now? */
-#define tcp_vegas_enabled(__tp)	((__tp)->vegas.doing_vegas_now)
-
-extern void tcp_ca_init(struct tcp_sock *tp);
-
-static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
-{
-	if (tcp_is_vegas(tp)) {
-		if (ca_state == TCP_CA_Open)
-			tcp_vegas_enable(tp);
-		else
-			tcp_vegas_disable(tp);
-	}
-	tp->ca_state = ca_state;
-}
-
 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
  * The exception is rate halving phase, when cwnd is decreasing towards
  * ssthresh.
@@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 static inline void __tcp_enter_cwr(struct tcp_sock *tp)
 {
 	tp->undo_marker = 0;
-	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 	tp->snd_cwnd = min(tp->snd_cwnd,
 			   tcp_packets_in_flight(tp) + 1U);
 	tp->snd_cwnd_cnt = 0;
@@ -1876,52 +1837,4 @@ struct tcp_iter_state {
 extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
 extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
 
-/* TCP Westwood functions and constants */
-
-#define TCP_WESTWOOD_INIT_RTT  (20*HZ)	/* maybe too conservative?! */
-#define TCP_WESTWOOD_RTT_MIN   (HZ/20)	/* 50ms */
-
-static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
-{
-	if (tcp_is_westwood(tp))
-		tp->westwood.rtt = rtt_seq;
-}
-
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
-		   (__u32) (tp->mss_cache_std),
-		   2U);
-}
-
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
-static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
-{
-	__u32 ssthresh = 0;
-
-	if (tcp_is_westwood(tp)) {
-		ssthresh = __tcp_westwood_bw_rttmin(tp);
-		if (ssthresh)
-			tp->snd_ssthresh = ssthresh;
-	}
-
-	return (ssthresh != 0);
-}
-
-static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
-{
-	__u32 cwnd = 0;
-
-	if (tcp_is_westwood(tp)) {
-		cwnd = __tcp_westwood_bw_rttmin(tp);
-		if (cwnd)
-			tp->snd_cwnd = cwnd;
-	}
-
-	return (cwnd != 0);
-}
 #endif	/* _TCP_H */