author		Paul Mundt <lethal@linux-sh.org>	2011-01-13 01:06:28 -0500
committer	Paul Mundt <lethal@linux-sh.org>	2011-01-13 01:06:28 -0500
commit		f43dc23d5ea91fca257be02138a255f02d98e806 (patch)
tree		b29722f6e965316e90ac97abf79923ced250dc21 /net/ipv4/tcp_output.c
parent		f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent		4162cf64973df51fc885825bc9ca4d055891c49f (diff)

Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework

Conflicts:
	arch/sh/kernel/cpu/sh2/setup-sh7619.c
	arch/sh/kernel/cpu/sh2a/setup-mxg.c
	arch/sh/kernel/cpu/sh2a/setup-sh7201.c
	arch/sh/kernel/cpu/sh2a/setup-sh7203.c
	arch/sh/kernel/cpu/sh2a/setup-sh7206.c
	arch/sh/kernel/cpu/sh3/setup-sh7705.c
	arch/sh/kernel/cpu/sh3/setup-sh770x.c
	arch/sh/kernel/cpu/sh3/setup-sh7710.c
	arch/sh/kernel/cpu/sh3/setup-sh7720.c
	arch/sh/kernel/cpu/sh4/setup-sh4-202.c
	arch/sh/kernel/cpu/sh4/setup-sh7750.c
	arch/sh/kernel/cpu/sh4/setup-sh7760.c
	arch/sh/kernel/cpu/sh4a/setup-sh7343.c
	arch/sh/kernel/cpu/sh4a/setup-sh7366.c
	arch/sh/kernel/cpu/sh4a/setup-sh7722.c
	arch/sh/kernel/cpu/sh4a/setup-sh7723.c
	arch/sh/kernel/cpu/sh4a/setup-sh7724.c
	arch/sh/kernel/cpu/sh4a/setup-sh7763.c
	arch/sh/kernel/cpu/sh4a/setup-sh7770.c
	arch/sh/kernel/cpu/sh4a/setup-sh7780.c
	arch/sh/kernel/cpu/sh4a/setup-sh7785.c
	arch/sh/kernel/cpu/sh4a/setup-sh7786.c
	arch/sh/kernel/cpu/sh4a/setup-shx3.c
	arch/sh/kernel/cpu/sh5/setup-sh5.c
	drivers/serial/sh-sci.c
	drivers/serial/sh-sci.h
	include/linux/serial_sci.h
Diffstat (limited to 'net/ipv4/tcp_output.c')
-rw-r--r--	net/ipv4/tcp_output.c	| 572
1 file changed, 421 insertions(+), 151 deletions(-)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 416fc4c2e7eb..dc7c096ddfef 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
 #include <net/tcp.h>
 
 #include <linux/compiler.h>
+#include <linux/gfp.h>
 #include <linux/module.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
@@ -54,11 +55,16 @@ int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 int sysctl_tcp_tso_win_divisor __read_mostly = 3;
 
 int sysctl_tcp_mtu_probing __read_mostly = 0;
-int sysctl_tcp_base_mss __read_mostly = 512;
+int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
+int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
+EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+
+
+/* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -113,9 +119,13 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 	struct dst_entry *dst = __sk_dst_get(sk);
 	int mss = tp->advmss;
 
-	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
-		mss = dst_metric(dst, RTAX_ADVMSS);
-		tp->advmss = mss;
+	if (dst) {
+		unsigned int metric = dst_metric_advmss(dst);
+
+		if (metric < mss) {
+			mss = metric;
+			tp->advmss = mss;
+		}
 	}
 
 	return (__u16)mss;
@@ -142,6 +152,7 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
 	tp->snd_cwnd_used = 0;
 }
 
+/* Congestion state accounting after a packet has been sent. */
 static void tcp_event_data_sent(struct tcp_sock *tp,
 				struct sk_buff *skb, struct sock *sk)
 {
@@ -161,6 +172,7 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
 	icsk->icsk_ack.pingpong = 1;
 }
 
+/* Account for an ACK we sent. */
 static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
 	tcp_dec_quickack_mode(sk, pkts);
@@ -176,7 +188,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
  */
 void tcp_select_initial_window(int __space, __u32 mss,
 			       __u32 *rcv_wnd, __u32 *window_clamp,
-			       int wscale_ok, __u8 *rcv_wscale)
+			       int wscale_ok, __u8 *rcv_wscale,
+			       __u32 init_rcv_wnd)
 {
 	unsigned int space = (__space < 0 ? 0 : __space);
 
@@ -215,23 +228,28 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		}
 	}
 
-	/* Set initial window to value enough for senders,
-	 * following RFC2414. Senders, not following this RFC,
-	 * will be satisfied with 2.
+	/* Set initial window to a value enough for senders starting with
+	 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
+	 * a limit on the initial window when mss is larger than 1460.
 	 */
 	if (mss > (1 << *rcv_wscale)) {
-		int init_cwnd = 4;
-		if (mss > 1460 * 3)
-			init_cwnd = 2;
-		else if (mss > 1460)
-			init_cwnd = 3;
-		if (*rcv_wnd > init_cwnd * mss)
-			*rcv_wnd = init_cwnd * mss;
+		int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
+		if (mss > 1460)
+			init_cwnd =
+			max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
+		/* when initializing use the value from init_rcv_wnd
+		 * rather than the default from above
+		 */
+		if (init_rcv_wnd)
+			*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+		else
+			*rcv_wnd = min(*rcv_wnd, init_cwnd * mss);
 	}
 
 	/* Set the clamp no higher than max representable value */
 	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
 }
+EXPORT_SYMBOL(tcp_select_initial_window);
 
 /* Chose a new window to advertise, update state in tcp_sock for the
  * socket, and return result with RFC1323 scaling applied. The return
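The hunk above replaces the fixed RFC 2414 initial-window table (2, 3 or 4 segments) with a scaled default plus an optional per-route override from RTAX_INITRWND. As a standalone illustration of the arithmetic (a userspace sketch, not kernel code; TCP_DEFAULT_INIT_RCVWND is assumed to be 10 here, per this kernel's include/net/tcp.h):

#include <stdio.h>

#define TCP_DEFAULT_INIT_RCVWND	10	/* assumed value from include/net/tcp.h */

static unsigned int max_u32(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int min_u32(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* Mirror of the new clamping: scale the 10-segment default down for
 * jumbo MSS values, never below 2 segments, and let a route-supplied
 * initrwnd win when present. */
static unsigned int initial_rcv_wnd(unsigned int rcv_wnd, unsigned int mss,
				    unsigned int init_rcv_wnd)
{
	unsigned int init_cwnd = TCP_DEFAULT_INIT_RCVWND;

	if (mss > 1460)
		init_cwnd = max_u32((1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
	if (init_rcv_wnd)
		return min_u32(rcv_wnd, init_rcv_wnd * mss);
	return min_u32(rcv_wnd, init_cwnd * mss);
}

int main(void)
{
	/* A 1460-byte MSS keeps the full 10-segment default ... */
	printf("%u\n", initial_rcv_wnd(65535, 1460, 0));	/* 14600 */
	/* ... while a 9000-byte MSS scales it down (1460*10/9000 = 1, clamped to 2). */
	printf("%u\n", initial_rcv_wnd(655350, 9000, 0));	/* 18000 */
	return 0;
}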
@@ -276,20 +294,22 @@ static u16 tcp_select_window(struct sock *sk)
 	return new_win;
 }
 
+/* Packet ECN state for a SYN-ACK */
 static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
 {
-	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
+	TCP_SKB_CB(skb)->flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
-		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
+		TCP_SKB_CB(skb)->flags &= ~TCPHDR_ECE;
 }
 
+/* Packet ECN state for a SYN. */
 static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->ecn_flags = 0;
 	if (sysctl_tcp_ecn == 1) {
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
 	}
 }
@@ -301,6 +321,9 @@ TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
 		th->ece = 1;
 }
 
+/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
+ * be sent.
+ */
 static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
 				int tcp_header_len)
 {
@@ -330,6 +353,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
  */
 static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 {
+	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->csum = 0;
 
 	TCP_SKB_CB(skb)->flags = flags;
@@ -340,7 +364,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	skb_shinfo(skb)->gso_type = 0;
 
 	TCP_SKB_CB(skb)->seq = seq;
-	if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN))
+	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
 		seq++;
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
@@ -353,16 +377,52 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_SACK_ADVERTISE	(1 << 0)
 #define OPTION_TS		(1 << 1)
 #define OPTION_MD5		(1 << 2)
+#define OPTION_WSCALE		(1 << 3)
+#define OPTION_COOKIE_EXTENSION	(1 << 4)
 
 struct tcp_out_options {
 	u8 options;		/* bit field of OPTION_* */
 	u8 ws;			/* window scale, 0 to disable */
 	u8 num_sack_blocks;	/* number of SACK blocks to include */
+	u8 hash_size;		/* bytes in hash_location */
 	u16 mss;		/* 0 to disable */
 	__u32 tsval, tsecr;	/* need to include OPTION_TS */
+	__u8 *hash_location;	/* temporary pointer, overloaded */
 };
 
-/* Beware: Something in the Internet is very sensitive to the ordering of
+/* The sysctl int routines are generic, so check consistency here.
+ */
+static u8 tcp_cookie_size_check(u8 desired)
+{
+	int cookie_size;
+
+	if (desired > 0)
+		/* previously specified */
+		return desired;
+
+	cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size);
+	if (cookie_size <= 0)
+		/* no default specified */
+		return 0;
+
+	if (cookie_size <= TCP_COOKIE_MIN)
+		/* value too small, specify minimum */
+		return TCP_COOKIE_MIN;
+
+	if (cookie_size >= TCP_COOKIE_MAX)
+		/* value too large, specify maximum */
+		return TCP_COOKIE_MAX;
+
+	if (cookie_size & 1)
+		/* 8-bit multiple, illegal, fix it */
+		cookie_size++;
+
+	return (u8)cookie_size;
+}
+
+/* Write previously computed TCP options to the packet.
+ *
+ * Beware: Something in the Internet is very sensitive to the ordering of
  * TCP options, we learned this through the hard way, so be careful here.
  * Luckily we can at least blame others for their non-compliance but from
  * inter-operatibility perspective it seems that we're somewhat stuck with
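The new tcp_cookie_size_check() clamps the sysctl default into the legal cookie range and rounds odd values up. A userspace sketch of the same decision tree, assuming TCP_COOKIE_MIN is 8 and TCP_COOKIE_MAX is 16 as in this series' tcp.h (both constants are assumptions here):

#include <stdio.h>

#define TCP_COOKIE_MIN	 8	/* assumed: 64 bits */
#define TCP_COOKIE_MAX	16	/* assumed: 128 bits */

static int sysctl_tcp_cookie_size;	/* stand-in for the real sysctl */

/* Same shape as tcp_cookie_size_check(): a caller-requested size wins,
 * otherwise the sysctl default is clamped to [TCP_COOKIE_MIN, TCP_COOKIE_MAX]
 * and rounded up to an even number of bytes. */
static unsigned char cookie_size_check(unsigned char desired)
{
	int cookie_size;

	if (desired > 0)
		return desired;
	cookie_size = sysctl_tcp_cookie_size;	/* the kernel uses ACCESS_ONCE() here */
	if (cookie_size <= 0)
		return 0;
	if (cookie_size <= TCP_COOKIE_MIN)
		return TCP_COOKIE_MIN;
	if (cookie_size >= TCP_COOKIE_MAX)
		return TCP_COOKIE_MAX;
	if (cookie_size & 1)
		cookie_size++;
	return (unsigned char)cookie_size;
}

int main(void)
{
	sysctl_tcp_cookie_size = 11;
	printf("%u\n", cookie_size_check(0));	/* 12: odd value rounded up */
	sysctl_tcp_cookie_size = 99;
	printf("%u\n", cookie_size_check(0));	/* 16: clamped to the maximum */
	return 0;
}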
@@ -374,17 +434,34 @@ struct tcp_out_options {
  * (but it may well be that other scenarios fail similarly).
  */
 static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
-			      const struct tcp_out_options *opts,
-			      __u8 **md5_hash) {
-	if (unlikely(OPTION_MD5 & opts->options)) {
-		*ptr++ = htonl((TCPOPT_NOP << 24) |
-			       (TCPOPT_NOP << 16) |
-			       (TCPOPT_MD5SIG << 8) |
-			       TCPOLEN_MD5SIG);
-		*md5_hash = (__u8 *)ptr;
+			      struct tcp_out_options *opts)
+{
+	u8 options = opts->options;	/* mungable copy */
+
+	/* Having both authentication and cookies for security is redundant,
+	 * and there's certainly not enough room. Instead, the cookie-less
+	 * extension variant is proposed.
+	 *
+	 * Consider the pessimal case with authentication. The options
+	 * could look like:
+	 *   COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
+	 */
+	if (unlikely(OPTION_MD5 & options)) {
+		if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+			*ptr++ = htonl((TCPOPT_COOKIE << 24) |
+				       (TCPOLEN_COOKIE_BASE << 16) |
+				       (TCPOPT_MD5SIG << 8) |
+				       TCPOLEN_MD5SIG);
+		} else {
+			*ptr++ = htonl((TCPOPT_NOP << 24) |
+				       (TCPOPT_NOP << 16) |
+				       (TCPOPT_MD5SIG << 8) |
+				       TCPOLEN_MD5SIG);
+		}
+		options &= ~OPTION_COOKIE_EXTENSION;
+		/* overload cookie hash location */
+		opts->hash_location = (__u8 *)ptr;
 		ptr += 4;
-	} else {
-		*md5_hash = NULL;
 	}
 
 	if (unlikely(opts->mss)) {
@@ -393,12 +470,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 			       opts->mss);
 	}
 
-	if (likely(OPTION_TS & opts->options)) {
-		if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
+	if (likely(OPTION_TS & options)) {
+		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
 			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
 				       (TCPOLEN_SACK_PERM << 16) |
 				       (TCPOPT_TIMESTAMP << 8) |
 				       TCPOLEN_TIMESTAMP);
+			options &= ~OPTION_SACK_ADVERTISE;
 		} else {
 			*ptr++ = htonl((TCPOPT_NOP << 24) |
 				       (TCPOPT_NOP << 16) |
@@ -409,15 +487,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 		*ptr++ = htonl(opts->tsecr);
 	}
 
-	if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
-		     !(OPTION_TS & opts->options))) {
+	/* Specification requires after timestamp, so do it now.
+	 *
+	 * Consider the pessimal case without authentication. The options
+	 * could look like:
+	 *   MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
+	 */
+	if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+		__u8 *cookie_copy = opts->hash_location;
+		u8 cookie_size = opts->hash_size;
+
+		/* 8-bit multiple handled in tcp_cookie_size_check() above,
+		 * and elsewhere.
+		 */
+		if (0x2 & cookie_size) {
+			__u8 *p = (__u8 *)ptr;
+
+			/* 16-bit multiple */
+			*p++ = TCPOPT_COOKIE;
+			*p++ = TCPOLEN_COOKIE_BASE + cookie_size;
+			*p++ = *cookie_copy++;
+			*p++ = *cookie_copy++;
+			ptr++;
+			cookie_size -= 2;
+		} else {
+			/* 32-bit multiple */
+			*ptr++ = htonl(((TCPOPT_NOP << 24) |
+					(TCPOPT_NOP << 16) |
+					(TCPOPT_COOKIE << 8) |
+					TCPOLEN_COOKIE_BASE) +
+				       cookie_size);
+		}
+
+		if (cookie_size > 0) {
+			memcpy(ptr, cookie_copy, cookie_size);
+			ptr += (cookie_size / 4);
+		}
+	}
+
+	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
 		*ptr++ = htonl((TCPOPT_NOP << 24) |
 			       (TCPOPT_NOP << 16) |
 			       (TCPOPT_SACK_PERM << 8) |
 			       TCPOLEN_SACK_PERM);
 	}
 
-	if (unlikely(opts->ws)) {
+	if (unlikely(OPTION_WSCALE & options)) {
 		*ptr++ = htonl((TCPOPT_NOP << 24) |
 			       (TCPOPT_WINDOW << 16) |
 			       (TCPOLEN_WINDOW << 8) |
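tcp_options_write() always emits options as whole 32-bit big-endian words, which is why each option is packed with shifts and pushed through htonl(). A self-contained sketch of one such word, using the standard option kind/length values from RFC 793 and RFC 2385 (NOP = 1, MD5SIG kind = 19, length 18):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_NOP	1
#define TCPOPT_MD5SIG	19
#define TCPOLEN_MD5SIG	18

int main(void)
{
	/* One 32-bit option word as tcp_options_write() builds it:
	 * NOP, NOP, MD5SIG, length -- htonl() fixes the byte order so
	 * the first byte on the wire is the first NOP. */
	uint32_t word = htonl((TCPOPT_NOP << 24) |
			      (TCPOPT_NOP << 16) |
			      (TCPOPT_MD5SIG << 8) |
			      TCPOLEN_MD5SIG);
	const unsigned char *p = (const unsigned char *)&word;

	printf("%u %u %u %u\n", p[0], p[1], p[2], p[3]);	/* 1 1 19 18 */
	return 0;
}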
@@ -445,17 +560,24 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 	}
 }
 
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ */
 static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 				struct tcp_out_options *opts,
 				struct tcp_md5sig_key **md5) {
 	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned size = 0;
+	struct tcp_cookie_values *cvp = tp->cookie_values;
+	unsigned remaining = MAX_TCP_OPTION_SPACE;
+	u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
+			 tcp_cookie_size_check(cvp->cookie_desired) :
+			 0;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tp->af_specific->md5_lookup(sk, sk);
 	if (*md5) {
 		opts->options |= OPTION_MD5;
-		size += TCPOLEN_MD5SIG_ALIGNED;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
 	}
 #else
 	*md5 = NULL;
@@ -471,76 +593,154 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	 * SACKs don't matter, we never delay an ACK when we have any of those
 	 * going out.  */
 	opts->mss = tcp_advertise_mss(sk);
-	size += TCPOLEN_MSS_ALIGNED;
+	remaining -= TCPOLEN_MSS_ALIGNED;
 
 	if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = tp->rx_opt.ts_recent;
-		size += TCPOLEN_TSTAMP_ALIGNED;
+		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
 	if (likely(sysctl_tcp_window_scaling)) {
 		opts->ws = tp->rx_opt.rcv_wscale;
-		if (likely(opts->ws))
-			size += TCPOLEN_WSCALE_ALIGNED;
+		opts->options |= OPTION_WSCALE;
+		remaining -= TCPOLEN_WSCALE_ALIGNED;
 	}
 	if (likely(sysctl_tcp_sack)) {
 		opts->options |= OPTION_SACK_ADVERTISE;
 		if (unlikely(!(OPTION_TS & opts->options)))
-			size += TCPOLEN_SACKPERM_ALIGNED;
+			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
-	return size;
+	/* Note that timestamps are required by the specification.
+	 *
+	 * Odd numbers of bytes are prohibited by the specification, ensuring
+	 * that the cookie is 16-bit aligned, and the resulting cookie pair is
+	 * 32-bit aligned.
+	 */
+	if (*md5 == NULL &&
+	    (OPTION_TS & opts->options) &&
+	    cookie_size > 0) {
+		int need = TCPOLEN_COOKIE_BASE + cookie_size;
+
+		if (0x2 & need) {
+			/* 32-bit multiple */
+			need += 2; /* NOPs */
+
+			if (need > remaining) {
+				/* try shrinking cookie to fit */
+				cookie_size -= 2;
+				need -= 4;
+			}
+		}
+		while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
+			cookie_size -= 4;
+			need -= 4;
+		}
+		if (TCP_COOKIE_MIN <= cookie_size) {
+			opts->options |= OPTION_COOKIE_EXTENSION;
+			opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
+			opts->hash_size = cookie_size;
+
+			/* Remember for future incarnations. */
+			cvp->cookie_desired = cookie_size;
+
+			if (cvp->cookie_desired != cvp->cookie_pair_size) {
+				/* Currently use random bytes as a nonce,
+				 * assuming these are completely unpredictable
+				 * by hostile users of the same system.
+				 */
+				get_random_bytes(&cvp->cookie_pair[0],
+						 cookie_size);
+				cvp->cookie_pair_size = cookie_size;
+			}
+
+			remaining -= need;
+		}
+	}
+	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
+/* Set up TCP options for SYN-ACKs. */
 static unsigned tcp_synack_options(struct sock *sk,
 				   struct request_sock *req,
 				   unsigned mss, struct sk_buff *skb,
 				   struct tcp_out_options *opts,
-				   struct tcp_md5sig_key **md5) {
-	unsigned size = 0;
+				   struct tcp_md5sig_key **md5,
+				   struct tcp_extend_values *xvp)
+{
 	struct inet_request_sock *ireq = inet_rsk(req);
-	char doing_ts;
+	unsigned remaining = MAX_TCP_OPTION_SPACE;
+	u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
+			 xvp->cookie_plus :
+			 0;
 
 #ifdef CONFIG_TCP_MD5SIG
 	*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
 	if (*md5) {
 		opts->options |= OPTION_MD5;
-		size += TCPOLEN_MD5SIG_ALIGNED;
+		remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+		/* We can't fit any SACK blocks in a packet with MD5 + TS
+		 * options. There was discussion about disabling SACK
+		 * rather than TS in order to fit in better with old,
+		 * buggy kernels, but that was deemed to be unnecessary.
+		 */
+		ireq->tstamp_ok &= !ireq->sack_ok;
 	}
 #else
 	*md5 = NULL;
 #endif
 
-	/* we can't fit any SACK blocks in a packet with MD5 + TS
-	   options. There was discussion about disabling SACK rather than TS in
-	   order to fit in better with old, buggy kernels, but that was deemed
-	   to be unnecessary. */
-	doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
-
+	/* We always send an MSS option. */
 	opts->mss = mss;
-	size += TCPOLEN_MSS_ALIGNED;
+	remaining -= TCPOLEN_MSS_ALIGNED;
 
 	if (likely(ireq->wscale_ok)) {
 		opts->ws = ireq->rcv_wscale;
-		if (likely(opts->ws))
-			size += TCPOLEN_WSCALE_ALIGNED;
+		opts->options |= OPTION_WSCALE;
+		remaining -= TCPOLEN_WSCALE_ALIGNED;
 	}
-	if (likely(doing_ts)) {
+	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
 		opts->tsval = TCP_SKB_CB(skb)->when;
 		opts->tsecr = req->ts_recent;
-		size += TCPOLEN_TSTAMP_ALIGNED;
+		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
 	if (likely(ireq->sack_ok)) {
 		opts->options |= OPTION_SACK_ADVERTISE;
-		if (unlikely(!doing_ts))
-			size += TCPOLEN_SACKPERM_ALIGNED;
+		if (unlikely(!ireq->tstamp_ok))
+			remaining -= TCPOLEN_SACKPERM_ALIGNED;
 	}
 
-	return size;
+	/* Similar rationale to tcp_syn_options() applies here, too.
+	 * If the <SYN> options fit, the same options should fit now!
+	 */
+	if (*md5 == NULL &&
+	    ireq->tstamp_ok &&
+	    cookie_plus > TCPOLEN_COOKIE_BASE) {
+		int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
+
+		if (0x2 & need) {
+			/* 32-bit multiple */
+			need += 2; /* NOPs */
+		}
+		if (need <= remaining) {
+			opts->options |= OPTION_COOKIE_EXTENSION;
+			opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
+			remaining -= need;
+		} else {
+			/* There's no error return, so flag it. */
+			xvp->cookie_out_never = 1; /* true */
+			opts->hash_size = 0;
+		}
+	}
+	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ */
 static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
 					struct tcp_out_options *opts,
 					struct tcp_md5sig_key **md5) {
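A key change in the two hunks above is counting a remaining byte budget down from MAX_TCP_OPTION_SPACE instead of counting size up, so that the cookie code can see exactly how much option room is left. A sketch of the accounting (MAX_TCP_OPTION_SPACE is 40, the TCP header's option limit; the aligned sizes are the ones the kernel uses):

#include <stdio.h>

#define MAX_TCP_OPTION_SPACE	40	/* TCP header allows at most 40 option bytes */
#define TCPOLEN_MSS_ALIGNED	 4
#define TCPOLEN_TSTAMP_ALIGNED	12	/* includes the two leading NOPs */
#define TCPOLEN_WSCALE_ALIGNED	 4

int main(void)
{
	/* The rewrite counts *remaining* budget down instead of size up,
	 * so the leftover is exactly what a cookie option may consume. */
	unsigned remaining = MAX_TCP_OPTION_SPACE;

	remaining -= TCPOLEN_MSS_ALIGNED;	/* MSS */
	remaining -= TCPOLEN_TSTAMP_ALIGNED;	/* TS (SACK-perm folds into it) */
	remaining -= TCPOLEN_WSCALE_ALIGNED;	/* window scale */

	/* 20 bytes: matches the "MSS(4) + SACK|TS(12) + COOKIE(20) +
	 * WSCALE(4) == 40" worst case in the comments above. */
	printf("left for a cookie: %u bytes\n", remaining);
	return 0;
}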
@@ -601,7 +801,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	struct tcp_out_options opts;
 	unsigned tcp_options_size, tcp_header_size;
 	struct tcp_md5sig_key *md5;
-	__u8 *md5_hash_location;
 	struct tcphdr *th;
 	int err;
 
@@ -627,15 +826,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
-	if (unlikely(tcb->flags & TCPCB_FLAG_SYN))
+	if (unlikely(tcb->flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
 		tcp_options_size = tcp_established_options(sk, skb, &opts,
 							   &md5);
 	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
 
-	if (tcp_packets_in_flight(tp) == 0)
+	if (tcp_packets_in_flight(tp) == 0) {
 		tcp_ca_event(sk, CA_EVENT_TX_START);
+		skb->ooo_okay = 1;
+	} else
+		skb->ooo_okay = 0;
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
@@ -643,14 +845,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
-	th->source		= inet->sport;
-	th->dest		= inet->dport;
+	th->source		= inet->inet_sport;
+	th->dest		= inet->inet_dport;
 	th->seq			= htonl(tcb->seq);
 	th->ack_seq		= htonl(tp->rcv_nxt);
 	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
 					tcb->flags);
 
-	if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+	if (unlikely(tcb->flags & TCPHDR_SYN)) {
 		/* RFC1323: The window in SYN & SYN/ACK segments
 		 * is never scaled.
 		 */
@@ -667,36 +869,37 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 			th->urg_ptr = htons(tp->snd_up - tcb->seq);
 			th->urg = 1;
 		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
-			th->urg_ptr = 0xFFFF;
+			th->urg_ptr = htons(0xFFFF);
 			th->urg = 1;
 		}
 	}
 
-	tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
-	if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
+	tcp_options_write((__be32 *)(th + 1), tp, &opts);
+	if (likely((tcb->flags & TCPHDR_SYN) == 0))
 		TCP_ECN_send(sk, skb, tcp_header_size);
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Calculate the MD5 hash, as we have all we need now */
 	if (md5) {
-		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
-		tp->af_specific->calc_md5_hash(md5_hash_location,
+		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+		tp->af_specific->calc_md5_hash(opts.hash_location,
 					       md5, sk, NULL, skb);
 	}
 #endif
 
-	icsk->icsk_af_ops->send_check(sk, skb->len, skb);
+	icsk->icsk_af_ops->send_check(sk, skb);
 
-	if (likely(tcb->flags & TCPCB_FLAG_ACK))
+	if (likely(tcb->flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
 	if (skb->len != tcp_header_size)
 		tcp_event_data_sent(tp, skb, sk);
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
+			      tcp_skb_pcount(skb));
 
-	err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+	err = icsk->icsk_af_ops->queue_xmit(skb);
 	if (likely(err <= 0))
 		return err;
 
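The urg_ptr change in this hunk is a type-correctness fix: th->urg_ptr is a big-endian __be16, so the store must go through htons() even though the 0xFFFF bit pattern happens to be byte-order invariant. A small demonstration of why the annotation matters for any other value (userspace sketch):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
	uint16_t raw  = 0xFFFF;		/* looks identical either way ... */
	uint16_t wire = htons(0xFFFF);	/* ... only because every bit is set */

	printf("%04x %04x\n", raw, wire);	/* ffff ffff */

	/* For any asymmetric value the difference is visible on a
	 * little-endian host, which is why a bare store into a __be16
	 * field draws a sparse warning. */
	printf("%04x\n", htons(0x1234));	/* 3412 on little-endian */
	return 0;
}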
@@ -705,7 +908,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	return net_xmit_eval(err);
 }
 
-/* This routine just queue's the buffer
+/* This routine just queues the buffer for sending.
  *
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
  * otherwise socket can stall.
@@ -722,10 +925,12 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 	sk_mem_charge(sk, skb->truesize);
 }
 
+/* Initialize TSO segments for a packet. */
 static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
 				 unsigned int mss_now)
 {
-	if (skb->len <= mss_now || !sk_can_gso(sk)) {
+	if (skb->len <= mss_now || !sk_can_gso(sk) ||
+	    skb->ip_summed == CHECKSUM_NONE) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
@@ -827,7 +1032,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
-	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
 
@@ -908,6 +1113,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
 	skb->len = skb->data_len;
 }
 
+/* Remove acked data from a packet in the transmit queue. */
 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 {
 	if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
@@ -936,7 +1142,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 	return 0;
 }
 
-/* Not accounting for SACKs here. */
+/* Calculate MSS. Not accounting for SACKs here. */
 int tcp_mtu_to_mss(struct sock *sk, int pmtu)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -980,6 +1186,7 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
 	return mtu;
 }
 
+/* MTU probing init per socket */
 void tcp_mtup_init(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -991,6 +1198,7 @@ void tcp_mtup_init(struct sock *sk)
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
 }
+EXPORT_SYMBOL(tcp_mtup_init);
 
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
@@ -1034,6 +1242,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	return mss_now;
 }
+EXPORT_SYMBOL(tcp_sync_mss);
 
 /* Compute the current effective MSS, taking SACKs and IP options,
  * and even PMTU discovery events into account.
@@ -1130,8 +1339,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
 	u32 in_flight, cwnd;
 
 	/* Don't be strict about the congestion window for the final FIN.  */
-	if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
-	    tcp_skb_pcount(skb) == 1)
+	if ((TCP_SKB_CB(skb)->flags & TCPHDR_FIN) && tcp_skb_pcount(skb) == 1)
 		return 1;
 
 	in_flight = tcp_packets_in_flight(tp);
@@ -1142,7 +1350,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp,
 	return 0;
 }
 
-/* This must be invoked the first time we consider transmitting
+/* Intialize TSO state of a skb.
+ * This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
 static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
@@ -1157,6 +1366,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
 	return tso_segs;
 }
 
+/* Minshall's variant of the Nagle send check. */
 static inline int tcp_minshall_check(const struct tcp_sock *tp)
 {
 	return after(tp->snd_sml, tp->snd_una) &&
@@ -1174,9 +1384,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
 				  const struct sk_buff *skb,
 				  unsigned mss_now, int nonagle)
 {
-	return (skb->len < mss_now &&
-		((nonagle & TCP_NAGLE_CORK) ||
-		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
+	return skb->len < mss_now &&
+		((nonagle & TCP_NAGLE_CORK) ||
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
 
 /* Return non-zero if the Nagle test allows this packet to be
@@ -1198,7 +1408,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
  * Nagle can be ignored during F-RTO too (see RFC4138).
  */
 	if (tcp_urg_mode(tp) || (tp->frto_counter == 2) ||
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN))
 		return 1;
 
 	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
@@ -1241,15 +1451,16 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
 	return cwnd_quota;
 }
 
+/* Test if sending is allowed right now. */
 int tcp_may_send_now(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb = tcp_send_head(sk);
 
-	return (skb &&
-		tcp_snd_test(sk, skb, tcp_current_mss(sk),
-			     (tcp_skb_is_last(sk, skb) ?
-			      tp->nonagle : TCP_NAGLE_PUSH)));
+	return skb &&
+		tcp_snd_test(sk, skb, tcp_current_mss(sk),
+			     (tcp_skb_is_last(sk, skb) ?
+			      tp->nonagle : TCP_NAGLE_PUSH));
 }
 
 /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
@@ -1260,7 +1471,7 @@ int tcp_may_send_now(struct sock *sk)
  * packet has never been sent out before (and thus is not cloned).
  */
 static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
-			unsigned int mss_now)
+			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
 	int nlen = skb->len - len;
@@ -1270,7 +1481,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	if (skb->len != skb->data_len)
 		return tcp_fragment(sk, skb, len, mss_now);
 
-	buff = sk_stream_alloc_skb(sk, 0, GFP_ATOMIC);
+	buff = sk_stream_alloc_skb(sk, 0, gfp);
 	if (unlikely(buff == NULL))
 		return -ENOMEM;
 
@@ -1286,7 +1497,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* PSH and FIN should only be set in the second packet. */
 	flags = TCP_SKB_CB(skb)->flags;
-	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->flags = flags;
 
 	/* This packet was never sent out yet, so no SACK bits. */
@@ -1316,8 +1527,9 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 send_win, cong_win, limit, in_flight;
+	int win_divisor;
 
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_FIN)
 		goto send_now;
 
 	if (icsk->icsk_ca_state != TCP_CA_Open)
@@ -1347,13 +1559,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
 	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
 		goto send_now;
 
-	if (sysctl_tcp_tso_win_divisor) {
+	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+	if (win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
 		/* If at least some fraction of a window is available,
 		 * just use it.
 		 */
-		chunk /= sysctl_tcp_tso_win_divisor;
+		chunk /= win_divisor;
 		if (limit >= chunk)
 			goto send_now;
 	} else {
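The win_divisor change above snapshots the sysctl once, so the zero test and the later division cannot observe two different values if the sysctl is rewritten concurrently. A minimal sketch of the pattern, with a userspace stand-in for the kernel's ACCESS_ONCE() macro:

#include <stdio.h>

/* Userspace stand-in for the kernel's ACCESS_ONCE(): force exactly one
 * load through a volatile-qualified view of the variable. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int sysctl_tcp_tso_win_divisor = 3;	/* may change at any time */

static unsigned int chunk_for(unsigned int window)
{
	int win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);

	/* Both the zero test and the division use the same snapshot;
	 * re-reading the sysctl between them could divide by zero. */
	if (win_divisor)
		return window / win_divisor;
	return window;
}

int main(void)
{
	printf("%u\n", chunk_for(60000));	/* 20000 with the default of 3 */
	return 0;
}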
@@ -1377,6 +1590,10 @@ send_now:
 }
 
 /* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets.  This discovers routing
+ * changes resulting in larger path MTUs.
+ *
  * Returns 0 if we should wait to probe (no cwnd available),
  *         1 if a probe was sent,
  *         -1 otherwise
@@ -1439,7 +1656,7 @@ static int tcp_mtu_probe(struct sock *sk)
 
 	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
 	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
-	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(nskb)->flags = TCPHDR_ACK;
 	TCP_SKB_CB(nskb)->sacked = 0;
 	nskb->csum = 0;
 	nskb->ip_summed = skb->ip_summed;
@@ -1464,7 +1681,7 @@ static int tcp_mtu_probe(struct sock *sk)
 			sk_wmem_free_skb(sk, skb);
 		} else {
 			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
-						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+						   ~(TCPHDR_FIN|TCPHDR_PSH);
 			if (!skb_shinfo(skb)->nr_frags) {
 				skb_pull(skb, copy);
 				if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -1564,7 +1781,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    cwnd_quota);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now)))
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
 			break;
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
@@ -1598,11 +1815,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 			       int nonagle)
 {
-	struct sk_buff *skb = tcp_send_head(sk);
-
-	if (!skb)
-		return;
-
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and
 	 * all will be happy.
@@ -1789,6 +2001,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	sk_wmem_free_skb(sk, next_skb);
 }
 
+/* Check if coalescing SKBs is legal. */
 static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
 {
 	if (tcp_skb_pcount(skb) > 1)
@@ -1807,6 +2020,9 @@ static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
 	return 1;
 }
 
+/* Collapse packets in the retransmit queue to make to create
+ * less packets on the wire. This is only done on retransmission.
+ */
 static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 				     int space)
 {
@@ -1816,7 +2032,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 
 	if (!sysctl_tcp_retrans_collapse)
 		return;
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)
+	if (TCP_SKB_CB(skb)->flags & TCPHDR_SYN)
 		return;
 
 	tcp_for_write_queue_from_safe(skb, tmp, sk) {
@@ -1885,8 +2101,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * case, when window is shrunk to zero. In this case
 	 * our retransmit serves as a zero window probe.
 	 */
-	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
-	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
+	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
 	if (skb->len > cur_mss) {
@@ -1908,7 +2124,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * since it is cheap to do so and saves bytes on the network.
 	 */
 	if (skb->len > 0 &&
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+	    (TCP_SKB_CB(skb)->flags & TCPHDR_FIN) &&
 	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
 		if (!pskb_trim(skb, 0)) {
 			/* Reuse, even though it does some unnecessary work */
@@ -1956,6 +2172,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	return err;
 }
 
+/* Check if we forward retransmits are possible in the current
+ * window/congestion state.
+ */
 static int tcp_can_forward_retransmit(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2001,6 +2220,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 	int mib_idx;
 	int fwd_rexmitting = 0;
 
+	if (!tp->packets_out)
+		return;
+
 	if (!tp->lost_out)
 		tp->retransmit_high = tp->snd_una;
 
@@ -2094,13 +2316,14 @@ void tcp_send_fin(struct sock *sk)
 	mss_now = tcp_current_mss(sk);
 
 	if (tcp_send_head(sk) != NULL) {
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_FIN;
 		TCP_SKB_CB(skb)->end_seq++;
 		tp->write_seq++;
 	} else {
 		/* Socket is locked, keep trying until memory is available. */
 		for (;;) {
-			skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
+			skb = alloc_skb_fclone(MAX_TCP_HEADER,
+					       sk->sk_allocation);
 			if (skb)
 				break;
 			yield();
@@ -2110,7 +2333,7 @@ void tcp_send_fin(struct sock *sk)
 		skb_reserve(skb, MAX_TCP_HEADER);
 		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
 		tcp_init_nondata_skb(skb, tp->write_seq,
-				     TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+				     TCPHDR_ACK | TCPHDR_FIN);
 		tcp_queue_skb(sk, skb);
 	}
 	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
@@ -2135,7 +2358,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	/* Reserve space for headers and prepare control bits. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
-			     TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+			     TCPHDR_ACK | TCPHDR_RST);
 	/* Send it off. */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	if (tcp_transmit_skb(sk, skb, 0, priority))
@@ -2144,7 +2367,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
 }
 
-/* WARNING: This routine must only be called when we have already sent
+/* Send a crossed SYN-ACK during socket establishment.
+ * WARNING: This routine must only be called when we have already sent
  * a SYN packet that crossed the incoming SYN that caused this routine
  * to get called. If this assumption fails then the initial rcv_wnd
  * and rcv_wscale values will not be correct.
@@ -2154,11 +2378,11 @@ int tcp_send_synack(struct sock *sk)
 	struct sk_buff *skb;
 
 	skb = tcp_write_queue_head(sk);
-	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+	if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPHDR_SYN)) {
 		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
 		return -EFAULT;
 	}
-	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) {
+	if (!(TCP_SKB_CB(skb)->flags & TCPHDR_ACK)) {
 		if (skb_cloned(skb)) {
 			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 			if (nskb == NULL)
@@ -2172,30 +2396,33 @@ int tcp_send_synack(struct sock *sk)
 			skb = nskb;
 		}
 
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_ACK;
 		TCP_ECN_send_synack(tcp_sk(sk), skb);
 	}
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
 
-/*
- * Prepare a SYN-ACK.
- */
+/* Prepare a SYN-ACK. */
 struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
-				struct request_sock *req)
+				struct request_sock *req,
+				struct request_values *rvp)
 {
+	struct tcp_out_options opts;
+	struct tcp_extend_values *xvp = tcp_xv(rvp);
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct tcp_sock *tp = tcp_sk(sk);
+	const struct tcp_cookie_values *cvp = tp->cookie_values;
 	struct tcphdr *th;
-	int tcp_header_size;
-	struct tcp_out_options opts;
 	struct sk_buff *skb;
 	struct tcp_md5sig_key *md5;
-	__u8 *md5_hash_location;
+	int tcp_header_size;
 	int mss;
+	int s_data_desired = 0;
 
-	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+	if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
+		s_data_desired = cvp->s_data_desired;
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
 	if (skb == NULL)
 		return NULL;
 
@@ -2204,7 +2431,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 
 	skb_dst_set(skb, dst_clone(dst));
 
-	mss = dst_metric(dst, RTAX_ADVMSS);
+	mss = dst_metric_advmss(dst);
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
 		mss = tp->rx_opt.user_mss;
 
@@ -2212,13 +2439,20 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 		__u8 rcv_wscale;
 		/* Set this up on the first call only */
 		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+		/* limit the window selection if the user enforce a smaller rx buffer */
+		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+			req->window_clamp = tcp_full_space(sk);
+
 		/* tcp_full_space because it is guaranteed to be the first packet */
 		tcp_select_initial_window(tcp_full_space(sk),
 			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
 			&req->rcv_wnd,
 			&req->window_clamp,
 			ireq->wscale_ok,
-			&rcv_wscale);
+			&rcv_wscale,
+			dst_metric(dst, RTAX_INITRWND));
 		ireq->rcv_wscale = rcv_wscale;
 	}
 
@@ -2230,8 +2464,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 #endif
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	tcp_header_size = tcp_synack_options(sk, req, mss,
-					     skb, &opts, &md5) +
-			  sizeof(struct tcphdr);
+					     skb, &opts, &md5, xvp)
+			+ sizeof(*th);
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
@@ -2247,30 +2481,64 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 	 * not even correctly set)
 	 */
 	tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
-			     TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+			     TCPHDR_SYN | TCPHDR_ACK);
+
+	if (OPTION_COOKIE_EXTENSION & opts.options) {
+		if (s_data_desired) {
+			u8 *buf = skb_put(skb, s_data_desired);
+
+			/* copy data directly from the listening socket. */
+			memcpy(buf, cvp->s_data_payload, s_data_desired);
+			TCP_SKB_CB(skb)->end_seq += s_data_desired;
+		}
+
+		if (opts.hash_size > 0) {
+			__u32 workspace[SHA_WORKSPACE_WORDS];
+			u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
+			u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
+
+			/* Secret recipe depends on the Timestamp, (future)
+			 * Sequence and Acknowledgment Numbers, Initiator
+			 * Cookie, and others handled by IP variant caller.
+			 */
+			*tail-- ^= opts.tsval;
+			*tail-- ^= tcp_rsk(req)->rcv_isn + 1;
+			*tail-- ^= TCP_SKB_CB(skb)->seq + 1;
+
+			/* recommended */
+			*tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source);
+			*tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
+
+			sha_transform((__u32 *)&xvp->cookie_bakery[0],
+				      (char *)mess,
+				      &workspace[0]);
+			opts.hash_location =
+				(__u8 *)&xvp->cookie_bakery[0];
+		}
+	}
+
 	th->seq = htonl(TCP_SKB_CB(skb)->seq);
 	th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
 
 	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
 	th->window = htons(min(req->rcv_wnd, 65535U));
-	tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+	tcp_options_write((__be32 *)(th + 1), tp, &opts);
 	th->doff = (tcp_header_size >> 2);
-	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));
 
 #ifdef CONFIG_TCP_MD5SIG
 	/* Okay, we have all we need - do the md5 hash if needed */
 	if (md5) {
-		tp->af_specific->calc_md5_hash(md5_hash_location,
+		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
 					       md5, NULL, req, skb);
 	}
 #endif
 
 	return skb;
 }
+EXPORT_SYMBOL(tcp_make_synack);
 
-/*
- * Do all connect socket setups that can be done AF independent.
- */
+/* Do all connect socket setups that can be done AF independent. */
 static void tcp_connect_init(struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
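For readers following the cookie-extension hunk above: cookie_bakery is one SHA-1 digest followed by one 64-byte message block, and sha_transform() (lib/sha1.c, via <linux/cryptohash.h>) folds the XOR-seasoned message into the digest in place. A minimal standalone sketch of that layout, assuming the COOKIE_* macros this series adds to include/net/tcp.h:

	#include <linux/cryptohash.h>	/* sha_init(), sha_transform() */

	/* Assumed layout, per this patch series:
	 *   bakery[0 .. COOKIE_DIGEST_WORDS-1]  running SHA-1 digest
	 *   bakery[COOKIE_DIGEST_WORDS .. ]     one message block, XORed
	 *                                       from the tail as above
	 */
	static void cookie_fold(u32 *bakery)
	{
		__u32 workspace[SHA_WORKSPACE_WORDS];

		/* one SHA-1 compression: digest <- F(digest, message) */
		sha_transform(&bakery[0],
			      (const char *)&bakery[COOKIE_DIGEST_WORDS],
			      workspace);
	}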
@@ -2297,18 +2565,24 @@ static void tcp_connect_init(struct sock *sk)
 
 	if (!tp->window_clamp)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
-	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+	tp->advmss = dst_metric_advmss(dst);
 	if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss)
 		tp->advmss = tp->rx_opt.user_mss;
 
 	tcp_initialize_rcv_mss(sk);
 
+	/* limit the window selection if the user enforce a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+		tp->window_clamp = tcp_full_space(sk);
+
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
 				  sysctl_tcp_window_scaling,
-				  &rcv_wscale);
+				  &rcv_wscale,
+				  dst_metric(dst, RTAX_INITRWND));
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
 	tp->rcv_ssthresh = tp->rcv_wnd;
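The new clamp honors receive buffers pinned from userspace. Illustrative only (not from this patch): locking SO_RCVBUF before connecting sets SOCK_RCVBUF_LOCK in sk_userlocks, so the advertised window can no longer outgrow what the socket can actually buffer:

	#include <sys/socket.h>

	/* Hypothetical userspace helper: pinning the receive buffer sets
	 * SOCK_RCVBUF_LOCK in the kernel, which the hunk above respects
	 * by clamping tp->window_clamp to tcp_full_space(sk).
	 */
	static int pin_rcvbuf(int fd, int bytes)
	{
		return setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
				  &bytes, sizeof(bytes));
	}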
@@ -2329,13 +2603,12 @@ static void tcp_connect_init(struct sock *sk)
 	tcp_clear_retrans(tp);
 }
 
-/*
- * Build a SYN and send it off.
- */
+/* Build a SYN and send it off. */
 int tcp_connect(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
+	int err;
 
 	tcp_connect_init(sk);
 
@@ -2347,7 +2620,7 @@ int tcp_connect(struct sock *sk)
 	skb_reserve(buff, MAX_TCP_HEADER);
 
 	tp->snd_nxt = tp->write_seq;
-	tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN);
+	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
 	TCP_ECN_send_syn(sk, buff);
 
 	/* Send it off. */
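The TCPCB_FLAG_* to TCPHDR_* renames in this and the surrounding hunks come from the TCP flag unification this merge pulls in: the control-block flag bits are the literal TCP header flag-byte values, so no translation is needed when the header is built. For reference, the unified values in include/net/tcp.h:

	#define TCPHDR_FIN 0x01
	#define TCPHDR_SYN 0x02
	#define TCPHDR_RST 0x04
	#define TCPHDR_PSH 0x08
	#define TCPHDR_ACK 0x10
	#define TCPHDR_URG 0x20
	#define TCPHDR_ECE 0x40
	#define TCPHDR_CWR 0x80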
@@ -2358,7 +2631,9 @@ int tcp_connect(struct sock *sk)
 	sk->sk_wmem_queued += buff->truesize;
 	sk_mem_charge(sk, buff->truesize);
 	tp->packets_out += tcp_skb_pcount(buff);
-	tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
+	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
+	if (err == -ECONNREFUSED)
+		return err;
 
 	/* We change tp->snd_nxt after the tcp_transmit_skb() call
 	 * in order to make this packet get counted in tcpOutSegs.
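Two behavioral changes land here: the SYN is transmitted with sk->sk_allocation instead of a hard-coded GFP_KERNEL, and an immediate local rejection now surfaces to the caller instead of waiting out the SYN retransmit timer. A sketch of a hypothetical caller:

	/* Hypothetical caller: with this patch, tcp_connect() can fail
	 * fast when the first transmit already reports -ECONNREFUSED
	 * (e.g. a rejecting local route), instead of always returning 0.
	 */
	static int start_handshake(struct sock *sk)
	{
		int err = tcp_connect(sk);	/* builds and sends the SYN */

		if (err)
			return err;		/* -ECONNREFUSED propagated */
		return 0;
	}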
@@ -2372,6 +2647,7 @@ int tcp_connect(struct sock *sk)
 				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
 	return 0;
 }
+EXPORT_SYMBOL(tcp_connect);
 
 /* Send out a delayed ack, the caller does the policy checking
  * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
@@ -2453,7 +2729,7 @@ void tcp_send_ack(struct sock *sk)
 
 	/* Reserve space for headers and prepare control bits. */
 	skb_reserve(buff, MAX_TCP_HEADER);
-	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK);
+	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
 
 	/* Send it off, this clears delayed acks for us. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -2487,11 +2763,12 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 	 * end to send an ack. Don't queue or clone SKB, just
 	 * send it.
 	 */
-	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK);
+	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
 	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
 
+/* Initiate keepalive or window probe from timer. */
 int tcp_write_wakeup(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
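Context for the new comment: when the timer fires with nothing sendable queued, tcp_write_wakeup() falls back to tcp_xmit_probe_skb(), whose sequence of snd_una - 1 is already acknowledged; the peer drops the segment but must answer with an ACK announcing its current window. A simplified sketch of that fallback, assuming the logic of this kernel generation:

	/* Simplified from the tail of tcp_write_wakeup(): with no queued
	 * data in window, emit an urgent-mode probe if snd_up is pending,
	 * then a plain window probe.
	 */
	if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
		tcp_xmit_probe_skb(sk, 1);	/* seq = snd_una: urgent */
	return tcp_xmit_probe_skb(sk, 0);	/* seq = snd_una - 1 */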
@@ -2516,13 +2793,13 @@ int tcp_write_wakeup(struct sock *sk)
 		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
-			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+			TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
 			if (tcp_fragment(sk, skb, seg_size, mss))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(sk, skb, mss);
 
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+		TCP_SKB_CB(skb)->flags |= TCPHDR_PSH;
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 		if (!err)
@@ -2575,10 +2852,3 @@ void tcp_send_probe0(struct sock *sk)
 					  TCP_RTO_MAX);
 	}
 }
-
-EXPORT_SYMBOL(tcp_select_initial_window);
-EXPORT_SYMBOL(tcp_connect);
-EXPORT_SYMBOL(tcp_make_synack);
-EXPORT_SYMBOL(tcp_simple_retransmit);
-EXPORT_SYMBOL(tcp_sync_mss);
-EXPORT_SYMBOL(tcp_mtup_init);
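The dissolved export block at the end of the file reflects the convention this diff adopts: each EXPORT_SYMBOL() now sits immediately after the function it exports (see tcp_make_synack() and tcp_connect() above; the remaining exports presumably move beside their definitions in earlier hunks of the full diff). A sketch of the style with a hypothetical function:

	#include <linux/module.h>

	/* Hypothetical example of the adopted style: the export
	 * annotation follows the definition directly, not a list
	 * at end-of-file.
	 */
	int example_probe(void)
	{
		return 0;
	}
	EXPORT_SYMBOL(example_probe);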