diff options
-rw-r--r-- | net/ipv4/Kconfig | 21 | ||||
-rw-r--r-- | net/ipv4/Makefile | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_bic.c | 331 |
3 files changed, 353 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 567b03b1c349..712ebacacb62 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -433,5 +433,26 @@ config IP_TCPDIAG | |||
433 | config IP_TCPDIAG_IPV6 | 433 | config IP_TCPDIAG_IPV6 |
434 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) | 434 | def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) |
435 | 435 | ||
436 | # TCP Reno is builtin (required as fallback) | ||
437 | menu "TCP congestion control" | ||
438 | depends on INET | ||
439 | |||
440 | config TCP_CONG_BIC | ||
441 | tristate "Binary Increase Congestion (BIC) control" | ||
442 | depends on INET | ||
443 | default y | ||
444 | ---help--- | ||
445 | BIC-TCP is a sender-side only change that ensures a linear RTT | ||
446 | fairness under large windows while offering both scalability and | ||
447 | bounded TCP-friendliness. The protocol combines two schemes | ||
448 | called additive increase and binary search increase. When the | ||
449 | congestion window is large, additive increase with a large | ||
450 | increment ensures linear RTT fairness as well as good | ||
451 | scalability. Under small congestion windows, binary search | ||
452 | increase provides TCP friendliness. | ||
453 | See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ | ||
454 | |||
455 | endmenu | ||
456 | |||
436 | source "net/ipv4/ipvs/Kconfig" | 457 | source "net/ipv4/ipvs/Kconfig" |
437 | 458 | ||
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89c0b4cb470e..1d1cac5ac06a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -31,6 +31,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ | |||
31 | obj-$(CONFIG_IP_VS) += ipvs/ | 31 | obj-$(CONFIG_IP_VS) += ipvs/ |
32 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o | 32 | obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o |
33 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 33 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
34 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | ||
34 | 35 | ||
35 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 36 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
36 | xfrm4_output.o | 37 | xfrm4_output.o |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 000000000000..ec38d45d6649 --- /dev/null +++ b/net/ipv4/tcp_bic.c | |||
@@ -0,0 +1,331 @@ | |||
1 | /* | ||
2 | * Binary Increase Congestion control for TCP | ||
3 | * | ||
4 | * This is from the implementation of BICTCP in | ||
5 | * Lisong Xu, Khaled Harfoush, and Injong Rhee. | ||
6 | * "Binary Increase Congestion Control for Fast, Long Distance | ||
7 | * Networks" in IEEE INFOCOM 2004 | ||
8 | * Available from: | ||
9 | * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf | ||
10 | * | ||
11 | * Unless BIC is enabled and congestion window is large | ||
12 | * this behaves the same as the original Reno. | ||
13 | */ | ||
14 | |||
15 | #include <linux/config.h> | ||
16 | #include <linux/mm.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <net/tcp.h> | ||
19 | |||
20 | |||
/*
 * Fixed-point scale for beta: the reduced window is computed as
 * snd_cwnd * beta / BICTCP_BETA_SCALE.
 */
#define BICTCP_BETA_SCALE	1024

/*
 * Divisor used by the binary search: each step covers
 * (last_max_cwnd - cwnd) / BICTCP_B of the remaining distance.
 */
#define BICTCP_B		4
28 | |||
29 | static int fast_convergence = 1; | ||
30 | static int max_increment = 32; | ||
31 | static int low_window = 14; | ||
32 | static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ | ||
33 | static int low_utilization_threshold = 153; | ||
34 | static int low_utilization_period = 2; | ||
35 | static int initial_ssthresh = 100; | ||
36 | static int smooth_part = 20; | ||
37 | |||
38 | module_param(fast_convergence, int, 0644); | ||
39 | MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); | ||
40 | module_param(max_increment, int, 0644); | ||
41 | MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); | ||
42 | module_param(low_window, int, 0644); | ||
43 | MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); | ||
44 | module_param(beta, int, 0644); | ||
45 | MODULE_PARM_DESC(beta, "beta for multiplicative increase"); | ||
46 | module_param(low_utilization_threshold, int, 0644); | ||
47 | MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); | ||
48 | module_param(low_utilization_period, int, 0644); | ||
49 | MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); | ||
50 | module_param(initial_ssthresh, int, 0644); | ||
51 | MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); | ||
52 | module_param(smooth_part, int, 0644); | ||
53 | MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); | ||
54 | |||
55 | |||
56 | /* BIC TCP Parameters */ | ||
57 | struct bictcp { | ||
58 | u32 cnt; /* increase cwnd by 1 after ACKs */ | ||
59 | u32 last_max_cwnd; /* last maximum snd_cwnd */ | ||
60 | u32 loss_cwnd; /* congestion window at last loss */ | ||
61 | u32 last_cwnd; /* the last snd_cwnd */ | ||
62 | u32 last_time; /* time when updated last_cwnd */ | ||
63 | u32 delay_min; /* min delay */ | ||
64 | u32 delay_max; /* max delay */ | ||
65 | u32 last_delay; | ||
66 | u8 low_utilization;/* 0: high; 1: low */ | ||
67 | u32 low_utilization_start; /* starting time of low utilization detection*/ | ||
68 | u32 epoch_start; /* beginning of an epoch */ | ||
69 | #define ACK_RATIO_SHIFT 4 | ||
70 | u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ | ||
71 | }; | ||
72 | |||
73 | static inline void bictcp_reset(struct bictcp *ca) | ||
74 | { | ||
75 | ca->cnt = 0; | ||
76 | ca->last_max_cwnd = 0; | ||
77 | ca->loss_cwnd = 0; | ||
78 | ca->last_cwnd = 0; | ||
79 | ca->last_time = 0; | ||
80 | ca->delay_min = 0; | ||
81 | ca->delay_max = 0; | ||
82 | ca->last_delay = 0; | ||
83 | ca->low_utilization = 0; | ||
84 | ca->low_utilization_start = 0; | ||
85 | ca->epoch_start = 0; | ||
86 | ca->delayed_ack = 2 << ACK_RATIO_SHIFT; | ||
87 | } | ||
88 | |||
89 | static void bictcp_init(struct tcp_sock *tp) | ||
90 | { | ||
91 | bictcp_reset(tcp_ca(tp)); | ||
92 | if (initial_ssthresh) | ||
93 | tp->snd_ssthresh = initial_ssthresh; | ||
94 | } | ||
95 | |||
96 | /* | ||
97 | * Compute congestion window to use. | ||
98 | */ | ||
99 | static inline void bictcp_update(struct bictcp *ca, u32 cwnd) | ||
100 | { | ||
101 | if (ca->last_cwnd == cwnd && | ||
102 | (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) | ||
103 | return; | ||
104 | |||
105 | ca->last_cwnd = cwnd; | ||
106 | ca->last_time = tcp_time_stamp; | ||
107 | |||
108 | if (ca->epoch_start == 0) /* record the beginning of an epoch */ | ||
109 | ca->epoch_start = tcp_time_stamp; | ||
110 | |||
111 | /* start off normal */ | ||
112 | if (cwnd <= low_window) { | ||
113 | ca->cnt = cwnd; | ||
114 | return; | ||
115 | } | ||
116 | |||
117 | /* binary increase */ | ||
118 | if (cwnd < ca->last_max_cwnd) { | ||
119 | __u32 dist = (ca->last_max_cwnd - cwnd) | ||
120 | / BICTCP_B; | ||
121 | |||
122 | if (dist > max_increment) | ||
123 | /* linear increase */ | ||
124 | ca->cnt = cwnd / max_increment; | ||
125 | else if (dist <= 1U) | ||
126 | /* binary search increase */ | ||
127 | ca->cnt = (cwnd * smooth_part) / BICTCP_B; | ||
128 | else | ||
129 | /* binary search increase */ | ||
130 | ca->cnt = cwnd / dist; | ||
131 | } else { | ||
132 | /* slow start AMD linear increase */ | ||
133 | if (cwnd < ca->last_max_cwnd + BICTCP_B) | ||
134 | /* slow start */ | ||
135 | ca->cnt = (cwnd * smooth_part) / BICTCP_B; | ||
136 | else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) | ||
137 | /* slow start */ | ||
138 | ca->cnt = (cwnd * (BICTCP_B-1)) | ||
139 | / cwnd-ca->last_max_cwnd; | ||
140 | else | ||
141 | /* linear increase */ | ||
142 | ca->cnt = cwnd / max_increment; | ||
143 | } | ||
144 | |||
145 | /* if in slow start or link utilization is very low */ | ||
146 | if ( ca->loss_cwnd == 0 || | ||
147 | (cwnd > ca->loss_cwnd && ca->low_utilization)) { | ||
148 | if (ca->cnt > 20) /* increase cwnd 5% per RTT */ | ||
149 | ca->cnt = 20; | ||
150 | } | ||
151 | |||
152 | ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; | ||
153 | if (ca->cnt == 0) /* cannot be zero */ | ||
154 | ca->cnt = 1; | ||
155 | } | ||
156 | |||
157 | |||
158 | /* Detect low utilization in congestion avoidance */ | ||
159 | static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) | ||
160 | { | ||
161 | struct bictcp *ca = tcp_ca(tp); | ||
162 | u32 dist, delay; | ||
163 | |||
164 | /* No time stamp */ | ||
165 | if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || | ||
166 | /* Discard delay samples right after fast recovery */ | ||
167 | tcp_time_stamp < ca->epoch_start + HZ || | ||
168 | /* this delay samples may not be accurate */ | ||
169 | flag == 0) { | ||
170 | ca->last_delay = 0; | ||
171 | goto notlow; | ||
172 | } | ||
173 | |||
174 | delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ | ||
175 | ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | ||
176 | if (delay == 0) /* no previous delay sample */ | ||
177 | goto notlow; | ||
178 | |||
179 | /* first time call or link delay decreases */ | ||
180 | if (ca->delay_min == 0 || ca->delay_min > delay) { | ||
181 | ca->delay_min = ca->delay_max = delay; | ||
182 | goto notlow; | ||
183 | } | ||
184 | |||
185 | if (ca->delay_max < delay) | ||
186 | ca->delay_max = delay; | ||
187 | |||
188 | /* utilization is low, if avg delay < dist*threshold | ||
189 | for checking_period time */ | ||
190 | dist = ca->delay_max - ca->delay_min; | ||
191 | if (dist <= ca->delay_min>>6 || | ||
192 | tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) | ||
193 | goto notlow; | ||
194 | |||
195 | if (ca->low_utilization_start == 0) { | ||
196 | ca->low_utilization = 0; | ||
197 | ca->low_utilization_start = tcp_time_stamp; | ||
198 | } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) | ||
199 | > low_utilization_period*HZ) { | ||
200 | ca->low_utilization = 1; | ||
201 | } | ||
202 | |||
203 | return; | ||
204 | |||
205 | notlow: | ||
206 | ca->low_utilization = 0; | ||
207 | ca->low_utilization_start = 0; | ||
208 | |||
209 | } | ||
210 | |||
211 | static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, | ||
212 | u32 seq_rtt, u32 in_flight, int data_acked) | ||
213 | { | ||
214 | struct bictcp *ca = tcp_ca(tp); | ||
215 | |||
216 | bictcp_low_utilization(tp, data_acked); | ||
217 | |||
218 | if (in_flight < tp->snd_cwnd) | ||
219 | return; | ||
220 | |||
221 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
222 | /* In "safe" area, increase. */ | ||
223 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
224 | tp->snd_cwnd++; | ||
225 | } else { | ||
226 | bictcp_update(ca, tp->snd_cwnd); | ||
227 | |||
228 | /* In dangerous area, increase slowly. | ||
229 | * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd | ||
230 | */ | ||
231 | if (tp->snd_cwnd_cnt >= ca->cnt) { | ||
232 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
233 | tp->snd_cwnd++; | ||
234 | tp->snd_cwnd_cnt = 0; | ||
235 | } else | ||
236 | tp->snd_cwnd_cnt++; | ||
237 | } | ||
238 | |||
239 | } | ||
240 | |||
241 | /* | ||
242 | * behave like Reno until low_window is reached, | ||
243 | * then increase congestion window slowly | ||
244 | */ | ||
245 | static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) | ||
246 | { | ||
247 | struct bictcp *ca = tcp_ca(tp); | ||
248 | |||
249 | ca->epoch_start = 0; /* end of epoch */ | ||
250 | |||
251 | /* in case of wrong delay_max*/ | ||
252 | if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) | ||
253 | ca->delay_max = ca->delay_min | ||
254 | + ((ca->delay_max - ca->delay_min)* 90) / 100; | ||
255 | |||
256 | /* Wmax and fast convergence */ | ||
257 | if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) | ||
258 | ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) | ||
259 | / (2 * BICTCP_BETA_SCALE); | ||
260 | else | ||
261 | ca->last_max_cwnd = tp->snd_cwnd; | ||
262 | |||
263 | ca->loss_cwnd = tp->snd_cwnd; | ||
264 | |||
265 | |||
266 | if (tp->snd_cwnd <= low_window) | ||
267 | return max(tp->snd_cwnd >> 1U, 2U); | ||
268 | else | ||
269 | return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); | ||
270 | } | ||
271 | |||
272 | static u32 bictcp_undo_cwnd(struct tcp_sock *tp) | ||
273 | { | ||
274 | struct bictcp *ca = tcp_ca(tp); | ||
275 | |||
276 | return max(tp->snd_cwnd, ca->last_max_cwnd); | ||
277 | } | ||
278 | |||
279 | static u32 bictcp_min_cwnd(struct tcp_sock *tp) | ||
280 | { | ||
281 | return tp->snd_ssthresh; | ||
282 | } | ||
283 | |||
284 | static void bictcp_state(struct tcp_sock *tp, u8 new_state) | ||
285 | { | ||
286 | if (new_state == TCP_CA_Loss) | ||
287 | bictcp_reset(tcp_ca(tp)); | ||
288 | } | ||
289 | |||
290 | /* Track delayed acknowledgement ratio using sliding window | ||
291 | * ratio = (15*ratio + sample) / 16 | ||
292 | */ | ||
293 | static void bictcp_acked(struct tcp_sock *tp, u32 cnt) | ||
294 | { | ||
295 | if (cnt > 0 && tp->ca_state == TCP_CA_Open) { | ||
296 | struct bictcp *ca = tcp_ca(tp); | ||
297 | cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; | ||
298 | ca->delayed_ack += cnt; | ||
299 | } | ||
300 | } | ||
301 | |||
302 | |||
303 | static struct tcp_congestion_ops bictcp = { | ||
304 | .init = bictcp_init, | ||
305 | .ssthresh = bictcp_recalc_ssthresh, | ||
306 | .cong_avoid = bictcp_cong_avoid, | ||
307 | .set_state = bictcp_state, | ||
308 | .undo_cwnd = bictcp_undo_cwnd, | ||
309 | .min_cwnd = bictcp_min_cwnd, | ||
310 | .pkts_acked = bictcp_acked, | ||
311 | .owner = THIS_MODULE, | ||
312 | .name = "bic", | ||
313 | }; | ||
314 | |||
315 | static int __init bictcp_register(void) | ||
316 | { | ||
317 | BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); | ||
318 | return tcp_register_congestion_control(&bictcp); | ||
319 | } | ||
320 | |||
321 | static void __exit bictcp_unregister(void) | ||
322 | { | ||
323 | tcp_unregister_congestion_control(&bictcp); | ||
324 | } | ||
325 | |||
326 | module_init(bictcp_register); | ||
327 | module_exit(bictcp_unregister); | ||
328 | |||
329 | MODULE_AUTHOR("Stephen Hemminger"); | ||
330 | MODULE_LICENSE("GPL"); | ||
331 | MODULE_DESCRIPTION("BIC TCP"); | ||