| author | Ingo Molnar <mingo@elte.hu> | 2008-10-28 11:26:12 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-10-28 11:26:12 -0400 |
| commit | 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch) | |
| tree | e730a4565e0318140d2fbd2f0415d18a339d7336 /net/ipv4/tcp_output.c | |
| parent | 41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff) | |
| parent | 0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff) | |
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'net/ipv4/tcp_output.c')
| -rw-r--r-- | net/ipv4/tcp_output.c | 683 |
1 file changed, 368 insertions, 315 deletions
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ad993ecb4810..e4c5ac9fe89b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
| @@ -5,8 +5,6 @@ | |||
| 5 | * | 5 | * |
| 6 | * Implementation of the Transmission Control Protocol(TCP). | 6 | * Implementation of the Transmission Control Protocol(TCP). |
| 7 | * | 7 | * |
| 8 | * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ | ||
| 9 | * | ||
| 10 | * Authors: Ross Biro | 8 | * Authors: Ross Biro |
| 11 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> | 9 | * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> |
| 12 | * Mark Evans, <evansmp@uhura.aston.ac.uk> | 10 | * Mark Evans, <evansmp@uhura.aston.ac.uk> |
| @@ -347,112 +345,240 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) | |||
| 347 | TCP_SKB_CB(skb)->end_seq = seq; | 345 | TCP_SKB_CB(skb)->end_seq = seq; |
| 348 | } | 346 | } |
| 349 | 347 | ||
| 350 | static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, | 348 | static inline int tcp_urg_mode(const struct tcp_sock *tp) |
| 351 | __u32 tstamp, __u8 **md5_hash) | ||
| 352 | { | 349 | { |
| 353 | if (tp->rx_opt.tstamp_ok) { | 350 | return tp->snd_una != tp->snd_up; |
| 354 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 351 | } |
| 355 | (TCPOPT_NOP << 16) | | ||
| 356 | (TCPOPT_TIMESTAMP << 8) | | ||
| 357 | TCPOLEN_TIMESTAMP); | ||
| 358 | *ptr++ = htonl(tstamp); | ||
| 359 | *ptr++ = htonl(tp->rx_opt.ts_recent); | ||
| 360 | } | ||
| 361 | if (tp->rx_opt.eff_sacks) { | ||
| 362 | struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; | ||
| 363 | int this_sack; | ||
| 364 | |||
| 365 | *ptr++ = htonl((TCPOPT_NOP << 24) | | ||
| 366 | (TCPOPT_NOP << 16) | | ||
| 367 | (TCPOPT_SACK << 8) | | ||
| 368 | (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * | ||
| 369 | TCPOLEN_SACK_PERBLOCK))); | ||
| 370 | |||
| 371 | for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { | ||
| 372 | *ptr++ = htonl(sp[this_sack].start_seq); | ||
| 373 | *ptr++ = htonl(sp[this_sack].end_seq); | ||
| 374 | } | ||
| 375 | 352 | ||
| 376 | if (tp->rx_opt.dsack) { | 353 | #define OPTION_SACK_ADVERTISE (1 << 0) |
| 377 | tp->rx_opt.dsack = 0; | 354 | #define OPTION_TS (1 << 1) |
| 378 | tp->rx_opt.eff_sacks--; | 355 | #define OPTION_MD5 (1 << 2) |
| 379 | } | 356 | |
| 380 | } | 357 | struct tcp_out_options { |
| 381 | #ifdef CONFIG_TCP_MD5SIG | 358 | u8 options; /* bit field of OPTION_* */ |
| 382 | if (md5_hash) { | 359 | u8 ws; /* window scale, 0 to disable */ |
| 360 | u8 num_sack_blocks; /* number of SACK blocks to include */ | ||
| 361 | u16 mss; /* 0 to disable */ | ||
| 362 | __u32 tsval, tsecr; /* need to include OPTION_TS */ | ||
| 363 | }; | ||
| 364 | |||
| 365 | /* Beware: Something in the Internet is very sensitive to the ordering of | ||
| 366 | * TCP options, we learned this through the hard way, so be careful here. | ||
| 367 | * Luckily we can at least blame others for their non-compliance but from | ||
| 368 | * inter-operatibility perspective it seems that we're somewhat stuck with | ||
| 369 | * the ordering which we have been using if we want to keep working with | ||
| 370 | * those broken things (not that it currently hurts anybody as there isn't | ||
| 371 | * particular reason why the ordering would need to be changed). | ||
| 372 | * | ||
| 373 | * At least SACK_PERM as the first option is known to lead to a disaster | ||
| 374 | * (but it may well be that other scenarios fail similarly). | ||
| 375 | */ | ||
| 376 | static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | ||
| 377 | const struct tcp_out_options *opts, | ||
| 378 | __u8 **md5_hash) { | ||
| 379 | if (unlikely(OPTION_MD5 & opts->options)) { | ||
| 383 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 380 | *ptr++ = htonl((TCPOPT_NOP << 24) | |
| 384 | (TCPOPT_NOP << 16) | | 381 | (TCPOPT_NOP << 16) | |
| 385 | (TCPOPT_MD5SIG << 8) | | 382 | (TCPOPT_MD5SIG << 8) | |
| 386 | TCPOLEN_MD5SIG); | 383 | TCPOLEN_MD5SIG); |
| 387 | *md5_hash = (__u8 *)ptr; | 384 | *md5_hash = (__u8 *)ptr; |
| 385 | ptr += 4; | ||
| 386 | } else { | ||
| 387 | *md5_hash = NULL; | ||
| 388 | } | 388 | } |
| 389 | #endif | ||
| 390 | } | ||
| 391 | 389 | ||
| 392 | /* Construct a tcp options header for a SYN or SYN_ACK packet. | 390 | if (unlikely(opts->mss)) { |
| 393 | * If this is every changed make sure to change the definition of | 391 | *ptr++ = htonl((TCPOPT_MSS << 24) | |
| 394 | * MAX_SYN_SIZE to match the new maximum number of options that you | 392 | (TCPOLEN_MSS << 16) | |
| 395 | * can generate. | 393 | opts->mss); |
| 396 | * | 394 | } |
| 397 | * Note - that with the RFC2385 TCP option, we make room for the | 395 | |
| 398 | * 16 byte MD5 hash. This will be filled in later, so the pointer for the | 396 | if (likely(OPTION_TS & opts->options)) { |
| 399 | * location to be filled is passed back up. | 397 | if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { |
| 400 | */ | ||
| 401 | static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, | ||
| 402 | int offer_wscale, int wscale, __u32 tstamp, | ||
| 403 | __u32 ts_recent, __u8 **md5_hash) | ||
| 404 | { | ||
| 405 | /* We always get an MSS option. | ||
| 406 | * The option bytes which will be seen in normal data | ||
| 407 | * packets should timestamps be used, must be in the MSS | ||
| 408 | * advertised. But we subtract them from tp->mss_cache so | ||
| 409 | * that calculations in tcp_sendmsg are simpler etc. | ||
| 410 | * So account for this fact here if necessary. If we | ||
| 411 | * don't do this correctly, as a receiver we won't | ||
| 412 | * recognize data packets as being full sized when we | ||
| 413 | * should, and thus we won't abide by the delayed ACK | ||
| 414 | * rules correctly. | ||
| 415 | * SACKs don't matter, we never delay an ACK when we | ||
| 416 | * have any of those going out. | ||
| 417 | */ | ||
| 418 | *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); | ||
| 419 | if (ts) { | ||
| 420 | if (sack) | ||
| 421 | *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | | 398 | *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | |
| 422 | (TCPOLEN_SACK_PERM << 16) | | 399 | (TCPOLEN_SACK_PERM << 16) | |
| 423 | (TCPOPT_TIMESTAMP << 8) | | 400 | (TCPOPT_TIMESTAMP << 8) | |
| 424 | TCPOLEN_TIMESTAMP); | 401 | TCPOLEN_TIMESTAMP); |
| 425 | else | 402 | } else { |
| 426 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 403 | *ptr++ = htonl((TCPOPT_NOP << 24) | |
| 427 | (TCPOPT_NOP << 16) | | 404 | (TCPOPT_NOP << 16) | |
| 428 | (TCPOPT_TIMESTAMP << 8) | | 405 | (TCPOPT_TIMESTAMP << 8) | |
| 429 | TCPOLEN_TIMESTAMP); | 406 | TCPOLEN_TIMESTAMP); |
| 430 | *ptr++ = htonl(tstamp); /* TSVAL */ | 407 | } |
| 431 | *ptr++ = htonl(ts_recent); /* TSECR */ | 408 | *ptr++ = htonl(opts->tsval); |
| 432 | } else if (sack) | 409 | *ptr++ = htonl(opts->tsecr); |
| 410 | } | ||
| 411 | |||
| 412 | if (unlikely(OPTION_SACK_ADVERTISE & opts->options && | ||
| 413 | !(OPTION_TS & opts->options))) { | ||
| 433 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 414 | *ptr++ = htonl((TCPOPT_NOP << 24) | |
| 434 | (TCPOPT_NOP << 16) | | 415 | (TCPOPT_NOP << 16) | |
| 435 | (TCPOPT_SACK_PERM << 8) | | 416 | (TCPOPT_SACK_PERM << 8) | |
| 436 | TCPOLEN_SACK_PERM); | 417 | TCPOLEN_SACK_PERM); |
| 437 | if (offer_wscale) | 418 | } |
| 419 | |||
| 420 | if (unlikely(opts->ws)) { | ||
| 438 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 421 | *ptr++ = htonl((TCPOPT_NOP << 24) | |
| 439 | (TCPOPT_WINDOW << 16) | | 422 | (TCPOPT_WINDOW << 16) | |
| 440 | (TCPOLEN_WINDOW << 8) | | 423 | (TCPOLEN_WINDOW << 8) | |
| 441 | (wscale)); | 424 | opts->ws); |
| 425 | } | ||
| 426 | |||
| 427 | if (unlikely(opts->num_sack_blocks)) { | ||
| 428 | struct tcp_sack_block *sp = tp->rx_opt.dsack ? | ||
| 429 | tp->duplicate_sack : tp->selective_acks; | ||
| 430 | int this_sack; | ||
| 431 | |||
| 432 | *ptr++ = htonl((TCPOPT_NOP << 24) | | ||
| 433 | (TCPOPT_NOP << 16) | | ||
| 434 | (TCPOPT_SACK << 8) | | ||
| 435 | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * | ||
| 436 | TCPOLEN_SACK_PERBLOCK))); | ||
| 437 | |||
| 438 | for (this_sack = 0; this_sack < opts->num_sack_blocks; | ||
| 439 | ++this_sack) { | ||
| 440 | *ptr++ = htonl(sp[this_sack].start_seq); | ||
| 441 | *ptr++ = htonl(sp[this_sack].end_seq); | ||
| 442 | } | ||
| 443 | |||
| 444 | if (tp->rx_opt.dsack) { | ||
| 445 | tp->rx_opt.dsack = 0; | ||
| 446 | tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; | ||
| 447 | } | ||
| 448 | } | ||
| 449 | } | ||
| 450 | |||
| 451 | static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, | ||
| 452 | struct tcp_out_options *opts, | ||
| 453 | struct tcp_md5sig_key **md5) { | ||
| 454 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 455 | unsigned size = 0; | ||
| 456 | |||
| 442 | #ifdef CONFIG_TCP_MD5SIG | 457 | #ifdef CONFIG_TCP_MD5SIG |
| 443 | /* | 458 | *md5 = tp->af_specific->md5_lookup(sk, sk); |
| 444 | * If MD5 is enabled, then we set the option, and include the size | 459 | if (*md5) { |
| 445 | * (always 18). The actual MD5 hash is added just before the | 460 | opts->options |= OPTION_MD5; |
| 446 | * packet is sent. | 461 | size += TCPOLEN_MD5SIG_ALIGNED; |
| 447 | */ | 462 | } |
| 448 | if (md5_hash) { | 463 | #else |
| 449 | *ptr++ = htonl((TCPOPT_NOP << 24) | | 464 | *md5 = NULL; |
| 450 | (TCPOPT_NOP << 16) | | 465 | #endif |
| 451 | (TCPOPT_MD5SIG << 8) | | 466 | |
| 452 | TCPOLEN_MD5SIG); | 467 | /* We always get an MSS option. The option bytes which will be seen in |
| 453 | *md5_hash = (__u8 *)ptr; | 468 | * normal data packets should timestamps be used, must be in the MSS |
| 469 | * advertised. But we subtract them from tp->mss_cache so that | ||
| 470 | * calculations in tcp_sendmsg are simpler etc. So account for this | ||
| 471 | * fact here if necessary. If we don't do this correctly, as a | ||
| 472 | * receiver we won't recognize data packets as being full sized when we | ||
| 473 | * should, and thus we won't abide by the delayed ACK rules correctly. | ||
| 474 | * SACKs don't matter, we never delay an ACK when we have any of those | ||
| 475 | * going out. */ | ||
| 476 | opts->mss = tcp_advertise_mss(sk); | ||
| 477 | size += TCPOLEN_MSS_ALIGNED; | ||
| 478 | |||
| 479 | if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { | ||
| 480 | opts->options |= OPTION_TS; | ||
| 481 | opts->tsval = TCP_SKB_CB(skb)->when; | ||
| 482 | opts->tsecr = tp->rx_opt.ts_recent; | ||
| 483 | size += TCPOLEN_TSTAMP_ALIGNED; | ||
| 484 | } | ||
| 485 | if (likely(sysctl_tcp_window_scaling)) { | ||
| 486 | opts->ws = tp->rx_opt.rcv_wscale; | ||
| 487 | if(likely(opts->ws)) | ||
| 488 | size += TCPOLEN_WSCALE_ALIGNED; | ||
| 489 | } | ||
| 490 | if (likely(sysctl_tcp_sack)) { | ||
| 491 | opts->options |= OPTION_SACK_ADVERTISE; | ||
| 492 | if (unlikely(!(OPTION_TS & opts->options))) | ||
| 493 | size += TCPOLEN_SACKPERM_ALIGNED; | ||
| 494 | } | ||
| 495 | |||
| 496 | return size; | ||
| 497 | } | ||
| 498 | |||
| 499 | static unsigned tcp_synack_options(struct sock *sk, | ||
| 500 | struct request_sock *req, | ||
| 501 | unsigned mss, struct sk_buff *skb, | ||
| 502 | struct tcp_out_options *opts, | ||
| 503 | struct tcp_md5sig_key **md5) { | ||
| 504 | unsigned size = 0; | ||
| 505 | struct inet_request_sock *ireq = inet_rsk(req); | ||
| 506 | char doing_ts; | ||
| 507 | |||
| 508 | #ifdef CONFIG_TCP_MD5SIG | ||
| 509 | *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); | ||
| 510 | if (*md5) { | ||
| 511 | opts->options |= OPTION_MD5; | ||
| 512 | size += TCPOLEN_MD5SIG_ALIGNED; | ||
| 513 | } | ||
| 514 | #else | ||
| 515 | *md5 = NULL; | ||
| 516 | #endif | ||
| 517 | |||
| 518 | /* we can't fit any SACK blocks in a packet with MD5 + TS | ||
| 519 | options. There was discussion about disabling SACK rather than TS in | ||
| 520 | order to fit in better with old, buggy kernels, but that was deemed | ||
| 521 | to be unnecessary. */ | ||
| 522 | doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); | ||
| 523 | |||
| 524 | opts->mss = mss; | ||
| 525 | size += TCPOLEN_MSS_ALIGNED; | ||
| 526 | |||
| 527 | if (likely(ireq->wscale_ok)) { | ||
| 528 | opts->ws = ireq->rcv_wscale; | ||
| 529 | if(likely(opts->ws)) | ||
| 530 | size += TCPOLEN_WSCALE_ALIGNED; | ||
| 531 | } | ||
| 532 | if (likely(doing_ts)) { | ||
| 533 | opts->options |= OPTION_TS; | ||
| 534 | opts->tsval = TCP_SKB_CB(skb)->when; | ||
| 535 | opts->tsecr = req->ts_recent; | ||
| 536 | size += TCPOLEN_TSTAMP_ALIGNED; | ||
| 537 | } | ||
| 538 | if (likely(ireq->sack_ok)) { | ||
| 539 | opts->options |= OPTION_SACK_ADVERTISE; | ||
| 540 | if (unlikely(!doing_ts)) | ||
| 541 | size += TCPOLEN_SACKPERM_ALIGNED; | ||
| 542 | } | ||
| 543 | |||
| 544 | return size; | ||
| 545 | } | ||
| 546 | |||
| 547 | static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, | ||
| 548 | struct tcp_out_options *opts, | ||
| 549 | struct tcp_md5sig_key **md5) { | ||
| 550 | struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; | ||
| 551 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 552 | unsigned size = 0; | ||
| 553 | |||
| 554 | #ifdef CONFIG_TCP_MD5SIG | ||
| 555 | *md5 = tp->af_specific->md5_lookup(sk, sk); | ||
| 556 | if (unlikely(*md5)) { | ||
| 557 | opts->options |= OPTION_MD5; | ||
| 558 | size += TCPOLEN_MD5SIG_ALIGNED; | ||
| 454 | } | 559 | } |
| 560 | #else | ||
| 561 | *md5 = NULL; | ||
| 455 | #endif | 562 | #endif |
| 563 | |||
| 564 | if (likely(tp->rx_opt.tstamp_ok)) { | ||
| 565 | opts->options |= OPTION_TS; | ||
| 566 | opts->tsval = tcb ? tcb->when : 0; | ||
| 567 | opts->tsecr = tp->rx_opt.ts_recent; | ||
| 568 | size += TCPOLEN_TSTAMP_ALIGNED; | ||
| 569 | } | ||
| 570 | |||
| 571 | if (unlikely(tp->rx_opt.eff_sacks)) { | ||
| 572 | const unsigned remaining = MAX_TCP_OPTION_SPACE - size; | ||
| 573 | opts->num_sack_blocks = | ||
| 574 | min_t(unsigned, tp->rx_opt.eff_sacks, | ||
| 575 | (remaining - TCPOLEN_SACK_BASE_ALIGNED) / | ||
| 576 | TCPOLEN_SACK_PERBLOCK); | ||
| 577 | size += TCPOLEN_SACK_BASE_ALIGNED + | ||
| 578 | opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; | ||
| 579 | } | ||
| 580 | |||
| 581 | return size; | ||
| 456 | } | 582 | } |
| 457 | 583 | ||
| 458 | /* This routine actually transmits TCP packets queued in by | 584 | /* This routine actually transmits TCP packets queued in by |
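The hunk above replaces the two hand-rolled option writers (tcp_build_and_update_options and tcp_syn_build_options) with a two-step scheme: a tcp_{syn,synack,established}_options() helper fills a struct tcp_out_options and returns how many aligned option bytes the segment will need, and tcp_options_write() later serializes that struct in the fixed on-wire order the "Beware" comment insists on. Below is a minimal standalone sketch of the sizing side for an established connection; the TCPOLEN_* values and the 40-byte option-space limit mirror the kernel's, but the struct, function, and driver code are local to this example, not kernel API.

```c
/* Standalone sketch of the established-options sizing logic above.
 * Constants mirror the kernel's aligned option lengths; names are local. */
#include <stdio.h>

#define MAX_TCP_OPTION_SPACE      40  /* TCP header allows at most 40 option bytes */
#define TCPOLEN_TSTAMP_ALIGNED    12  /* NOP NOP TIMESTAMP, padded to 4-byte words */
#define TCPOLEN_SACK_BASE_ALIGNED  4  /* NOP NOP SACK kind+len */
#define TCPOLEN_SACK_PERBLOCK      8  /* two 32-bit sequence numbers per block */

struct out_options {
	int tstamp_ok;
	unsigned num_sack_blocks;
};

/* Return the option bytes an established-connection segment will carry,
 * capping the SACK blocks to whatever space is left after timestamps. */
static unsigned established_options_size(int tstamp_ok, unsigned eff_sacks,
					 struct out_options *opts)
{
	unsigned size = 0;

	opts->tstamp_ok = tstamp_ok;
	if (tstamp_ok)
		size += TCPOLEN_TSTAMP_ALIGNED;

	opts->num_sack_blocks = 0;
	if (eff_sacks) {
		unsigned remaining = MAX_TCP_OPTION_SPACE - size;
		unsigned max_blocks = (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
				      TCPOLEN_SACK_PERBLOCK;
		opts->num_sack_blocks = eff_sacks < max_blocks ? eff_sacks
							       : max_blocks;
		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}
	return size;
}

int main(void)
{
	struct out_options opts;
	/* With timestamps on, only 3 SACK blocks fit: 12 + 4 + 3*8 = 40. */
	unsigned size = established_options_size(1, 4, &opts);
	printf("option bytes=%u, sack blocks=%u\n", size, opts.num_sack_blocks);
	return 0;
}
```

tcp_transmit_skb() then only has to add sizeof(struct tcphdr) to the returned size to obtain the full header length, as the next hunks show.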
| @@ -473,13 +599,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 473 | struct inet_sock *inet; | 599 | struct inet_sock *inet; |
| 474 | struct tcp_sock *tp; | 600 | struct tcp_sock *tp; |
| 475 | struct tcp_skb_cb *tcb; | 601 | struct tcp_skb_cb *tcb; |
| 476 | int tcp_header_size; | 602 | struct tcp_out_options opts; |
| 477 | #ifdef CONFIG_TCP_MD5SIG | 603 | unsigned tcp_options_size, tcp_header_size; |
| 478 | struct tcp_md5sig_key *md5; | 604 | struct tcp_md5sig_key *md5; |
| 479 | __u8 *md5_hash_location; | 605 | __u8 *md5_hash_location; |
| 480 | #endif | ||
| 481 | struct tcphdr *th; | 606 | struct tcphdr *th; |
| 482 | int sysctl_flags; | ||
| 483 | int err; | 607 | int err; |
| 484 | 608 | ||
| 485 | BUG_ON(!skb || !tcp_skb_pcount(skb)); | 609 | BUG_ON(!skb || !tcp_skb_pcount(skb)); |
| @@ -502,50 +626,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 502 | inet = inet_sk(sk); | 626 | inet = inet_sk(sk); |
| 503 | tp = tcp_sk(sk); | 627 | tp = tcp_sk(sk); |
| 504 | tcb = TCP_SKB_CB(skb); | 628 | tcb = TCP_SKB_CB(skb); |
| 505 | tcp_header_size = tp->tcp_header_len; | 629 | memset(&opts, 0, sizeof(opts)); |
| 506 | |||
| 507 | #define SYSCTL_FLAG_TSTAMPS 0x1 | ||
| 508 | #define SYSCTL_FLAG_WSCALE 0x2 | ||
| 509 | #define SYSCTL_FLAG_SACK 0x4 | ||
| 510 | 630 | ||
| 511 | sysctl_flags = 0; | 631 | if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) |
| 512 | if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { | 632 | tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); |
| 513 | tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; | 633 | else |
| 514 | if (sysctl_tcp_timestamps) { | 634 | tcp_options_size = tcp_established_options(sk, skb, &opts, |
| 515 | tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; | 635 | &md5); |
| 516 | sysctl_flags |= SYSCTL_FLAG_TSTAMPS; | 636 | tcp_header_size = tcp_options_size + sizeof(struct tcphdr); |
| 517 | } | ||
| 518 | if (sysctl_tcp_window_scaling) { | ||
| 519 | tcp_header_size += TCPOLEN_WSCALE_ALIGNED; | ||
| 520 | sysctl_flags |= SYSCTL_FLAG_WSCALE; | ||
| 521 | } | ||
| 522 | if (sysctl_tcp_sack) { | ||
| 523 | sysctl_flags |= SYSCTL_FLAG_SACK; | ||
| 524 | if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) | ||
| 525 | tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; | ||
| 526 | } | ||
| 527 | } else if (unlikely(tp->rx_opt.eff_sacks)) { | ||
| 528 | /* A SACK is 2 pad bytes, a 2 byte header, plus | ||
| 529 | * 2 32-bit sequence numbers for each SACK block. | ||
| 530 | */ | ||
| 531 | tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + | ||
| 532 | (tp->rx_opt.eff_sacks * | ||
| 533 | TCPOLEN_SACK_PERBLOCK)); | ||
| 534 | } | ||
| 535 | 637 | ||
| 536 | if (tcp_packets_in_flight(tp) == 0) | 638 | if (tcp_packets_in_flight(tp) == 0) |
| 537 | tcp_ca_event(sk, CA_EVENT_TX_START); | 639 | tcp_ca_event(sk, CA_EVENT_TX_START); |
| 538 | 640 | ||
| 539 | #ifdef CONFIG_TCP_MD5SIG | ||
| 540 | /* | ||
| 541 | * Are we doing MD5 on this segment? If so - make | ||
| 542 | * room for it. | ||
| 543 | */ | ||
| 544 | md5 = tp->af_specific->md5_lookup(sk, sk); | ||
| 545 | if (md5) | ||
| 546 | tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; | ||
| 547 | #endif | ||
| 548 | |||
| 549 | skb_push(skb, tcp_header_size); | 641 | skb_push(skb, tcp_header_size); |
| 550 | skb_reset_transport_header(skb); | 642 | skb_reset_transport_header(skb); |
| 551 | skb_set_owner_w(skb, sk); | 643 | skb_set_owner_w(skb, sk); |
| @@ -570,45 +662,23 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 570 | th->check = 0; | 662 | th->check = 0; |
| 571 | th->urg_ptr = 0; | 663 | th->urg_ptr = 0; |
| 572 | 664 | ||
| 573 | if (unlikely(tp->urg_mode && | 665 | /* The urg_mode check is necessary during a below snd_una win probe */ |
| 666 | if (unlikely(tcp_urg_mode(tp) && | ||
| 574 | between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { | 667 | between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { |
| 575 | th->urg_ptr = htons(tp->snd_up - tcb->seq); | 668 | th->urg_ptr = htons(tp->snd_up - tcb->seq); |
| 576 | th->urg = 1; | 669 | th->urg = 1; |
| 577 | } | 670 | } |
| 578 | 671 | ||
| 579 | if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { | 672 | tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); |
| 580 | tcp_syn_build_options((__be32 *)(th + 1), | 673 | if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) |
| 581 | tcp_advertise_mss(sk), | ||
| 582 | (sysctl_flags & SYSCTL_FLAG_TSTAMPS), | ||
| 583 | (sysctl_flags & SYSCTL_FLAG_SACK), | ||
| 584 | (sysctl_flags & SYSCTL_FLAG_WSCALE), | ||
| 585 | tp->rx_opt.rcv_wscale, | ||
| 586 | tcb->when, | ||
| 587 | tp->rx_opt.ts_recent, | ||
| 588 | |||
| 589 | #ifdef CONFIG_TCP_MD5SIG | ||
| 590 | md5 ? &md5_hash_location : | ||
| 591 | #endif | ||
| 592 | NULL); | ||
| 593 | } else { | ||
| 594 | tcp_build_and_update_options((__be32 *)(th + 1), | ||
| 595 | tp, tcb->when, | ||
| 596 | #ifdef CONFIG_TCP_MD5SIG | ||
| 597 | md5 ? &md5_hash_location : | ||
| 598 | #endif | ||
| 599 | NULL); | ||
| 600 | TCP_ECN_send(sk, skb, tcp_header_size); | 674 | TCP_ECN_send(sk, skb, tcp_header_size); |
| 601 | } | ||
| 602 | 675 | ||
| 603 | #ifdef CONFIG_TCP_MD5SIG | 676 | #ifdef CONFIG_TCP_MD5SIG |
| 604 | /* Calculate the MD5 hash, as we have all we need now */ | 677 | /* Calculate the MD5 hash, as we have all we need now */ |
| 605 | if (md5) { | 678 | if (md5) { |
| 679 | sk->sk_route_caps &= ~NETIF_F_GSO_MASK; | ||
| 606 | tp->af_specific->calc_md5_hash(md5_hash_location, | 680 | tp->af_specific->calc_md5_hash(md5_hash_location, |
| 607 | md5, | 681 | md5, sk, NULL, skb); |
| 608 | sk, NULL, NULL, | ||
| 609 | tcp_hdr(skb), | ||
| 610 | sk->sk_protocol, | ||
| 611 | skb->len); | ||
| 612 | } | 682 | } |
| 613 | #endif | 683 | #endif |
| 614 | 684 | ||
| @@ -621,7 +691,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 621 | tcp_event_data_sent(tp, skb, sk); | 691 | tcp_event_data_sent(tp, skb, sk); |
| 622 | 692 | ||
| 623 | if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) | 693 | if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) |
| 624 | TCP_INC_STATS(TCP_MIB_OUTSEGS); | 694 | TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); |
| 625 | 695 | ||
| 626 | err = icsk->icsk_af_ops->queue_xmit(skb, 0); | 696 | err = icsk->icsk_af_ops->queue_xmit(skb, 0); |
| 627 | if (likely(err <= 0)) | 697 | if (likely(err <= 0)) |
| @@ -630,10 +700,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |||
| 630 | tcp_enter_cwr(sk, 1); | 700 | tcp_enter_cwr(sk, 1); |
| 631 | 701 | ||
| 632 | return net_xmit_eval(err); | 702 | return net_xmit_eval(err); |
| 633 | |||
| 634 | #undef SYSCTL_FLAG_TSTAMPS | ||
| 635 | #undef SYSCTL_FLAG_WSCALE | ||
| 636 | #undef SYSCTL_FLAG_SACK | ||
| 637 | } | 703 | } |
| 638 | 704 | ||
| 639 | /* This routine just queue's the buffer | 705 | /* This routine just queue's the buffer |
| @@ -963,7 +1029,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) | |||
| 963 | /* Compute the current effective MSS, taking SACKs and IP options, | 1029 | /* Compute the current effective MSS, taking SACKs and IP options, |
| 964 | * and even PMTU discovery events into account. | 1030 | * and even PMTU discovery events into account. |
| 965 | * | 1031 | * |
| 966 | * LARGESEND note: !urg_mode is overkill, only frames up to snd_up | 1032 | * LARGESEND note: !tcp_urg_mode is overkill, only frames up to snd_up |
| 967 | * cannot be large. However, taking into account rare use of URG, this | 1033 | * cannot be large. However, taking into account rare use of URG, this |
| 968 | * is not a big flaw. | 1034 | * is not a big flaw. |
| 969 | */ | 1035 | */ |
| @@ -974,10 +1040,13 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) | |||
| 974 | u32 mss_now; | 1040 | u32 mss_now; |
| 975 | u16 xmit_size_goal; | 1041 | u16 xmit_size_goal; |
| 976 | int doing_tso = 0; | 1042 | int doing_tso = 0; |
| 1043 | unsigned header_len; | ||
| 1044 | struct tcp_out_options opts; | ||
| 1045 | struct tcp_md5sig_key *md5; | ||
| 977 | 1046 | ||
| 978 | mss_now = tp->mss_cache; | 1047 | mss_now = tp->mss_cache; |
| 979 | 1048 | ||
| 980 | if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) | 1049 | if (large_allowed && sk_can_gso(sk) && !tcp_urg_mode(tp)) |
| 981 | doing_tso = 1; | 1050 | doing_tso = 1; |
| 982 | 1051 | ||
| 983 | if (dst) { | 1052 | if (dst) { |
| @@ -986,14 +1055,16 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) | |||
| 986 | mss_now = tcp_sync_mss(sk, mtu); | 1055 | mss_now = tcp_sync_mss(sk, mtu); |
| 987 | } | 1056 | } |
| 988 | 1057 | ||
| 989 | if (tp->rx_opt.eff_sacks) | 1058 | header_len = tcp_established_options(sk, NULL, &opts, &md5) + |
| 990 | mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + | 1059 | sizeof(struct tcphdr); |
| 991 | (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); | 1060 | /* The mss_cache is sized based on tp->tcp_header_len, which assumes |
| 992 | 1061 | * some common options. If this is an odd packet (because we have SACK | |
| 993 | #ifdef CONFIG_TCP_MD5SIG | 1062 | * blocks etc) then our calculated header_len will be different, and |
| 994 | if (tp->af_specific->md5_lookup(sk, sk)) | 1063 | * we have to adjust mss_now correspondingly */ |
| 995 | mss_now -= TCPOLEN_MD5SIG_ALIGNED; | 1064 | if (header_len != tp->tcp_header_len) { |
| 996 | #endif | 1065 | int delta = (int) header_len - tp->tcp_header_len; |
| 1066 | mss_now -= delta; | ||
| 1067 | } | ||
| 997 | 1068 | ||
| 998 | xmit_size_goal = mss_now; | 1069 | xmit_size_goal = mss_now; |
| 999 | 1070 | ||
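tcp_current_mss() no longer subtracts SACK and MD5 option sizes by hand; it asks tcp_established_options() for the header length the next segment would actually need and adjusts mss_cache by the difference from the cached tcp_header_len. A toy calculation with illustrative byte counts (not taken from a live socket):

```c
#include <stdio.h>

int main(void)
{
	/* Illustrative numbers only. */
	unsigned mss_cache      = 1448;                 /* sized against tcp_header_len */
	unsigned tcp_header_len = 20 + 12;              /* base TCP header + timestamps */
	unsigned header_len     = 20 + 12 + 4 + 2 * 8;  /* + SACK base + 2 SACK blocks */

	int delta = (int)header_len - (int)tcp_header_len;  /* 20 extra option bytes */
	printf("mss_now = %u\n", mss_cache - delta);         /* 1428 */
	return 0;
}
```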
| @@ -1139,7 +1210,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, | |||
| 1139 | /* Don't use the nagle rule for urgent data (or for the final FIN). | 1210 | /* Don't use the nagle rule for urgent data (or for the final FIN). |
| 1140 | * Nagle can be ignored during F-RTO too (see RFC4138). | 1211 | * Nagle can be ignored during F-RTO too (see RFC4138). |
| 1141 | */ | 1212 | */ |
| 1142 | if (tp->urg_mode || (tp->frto_counter == 2) || | 1213 | if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || |
| 1143 | (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) | 1214 | (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) |
| 1144 | return 1; | 1215 | return 1; |
| 1145 | 1216 | ||
| @@ -1770,6 +1841,8 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, | |||
| 1770 | 1841 | ||
| 1771 | /* changed transmit queue under us so clear hints */ | 1842 | /* changed transmit queue under us so clear hints */ |
| 1772 | tcp_clear_retrans_hints_partial(tp); | 1843 | tcp_clear_retrans_hints_partial(tp); |
| 1844 | if (next_skb == tp->retransmit_skb_hint) | ||
| 1845 | tp->retransmit_skb_hint = skb; | ||
| 1773 | 1846 | ||
| 1774 | sk_wmem_free_skb(sk, next_skb); | 1847 | sk_wmem_free_skb(sk, next_skb); |
| 1775 | } | 1848 | } |
| @@ -1784,7 +1857,7 @@ void tcp_simple_retransmit(struct sock *sk) | |||
| 1784 | struct tcp_sock *tp = tcp_sk(sk); | 1857 | struct tcp_sock *tp = tcp_sk(sk); |
| 1785 | struct sk_buff *skb; | 1858 | struct sk_buff *skb; |
| 1786 | unsigned int mss = tcp_current_mss(sk, 0); | 1859 | unsigned int mss = tcp_current_mss(sk, 0); |
| 1787 | int lost = 0; | 1860 | u32 prior_lost = tp->lost_out; |
| 1788 | 1861 | ||
| 1789 | tcp_for_write_queue(skb, sk) { | 1862 | tcp_for_write_queue(skb, sk) { |
| 1790 | if (skb == tcp_send_head(sk)) | 1863 | if (skb == tcp_send_head(sk)) |
| @@ -1795,17 +1868,13 @@ void tcp_simple_retransmit(struct sock *sk) | |||
| 1795 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; | 1868 | TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; |
| 1796 | tp->retrans_out -= tcp_skb_pcount(skb); | 1869 | tp->retrans_out -= tcp_skb_pcount(skb); |
| 1797 | } | 1870 | } |
| 1798 | if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) { | 1871 | tcp_skb_mark_lost_uncond_verify(tp, skb); |
| 1799 | TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | ||
| 1800 | tp->lost_out += tcp_skb_pcount(skb); | ||
| 1801 | lost = 1; | ||
| 1802 | } | ||
| 1803 | } | 1872 | } |
| 1804 | } | 1873 | } |
| 1805 | 1874 | ||
| 1806 | tcp_clear_all_retrans_hints(tp); | 1875 | tcp_clear_retrans_hints_partial(tp); |
| 1807 | 1876 | ||
| 1808 | if (!lost) | 1877 | if (prior_lost == tp->lost_out) |
| 1809 | return; | 1878 | return; |
| 1810 | 1879 | ||
| 1811 | if (tcp_is_reno(tp)) | 1880 | if (tcp_is_reno(tp)) |
| @@ -1880,8 +1949,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1880 | /* Collapse two adjacent packets if worthwhile and we can. */ | 1949 | /* Collapse two adjacent packets if worthwhile and we can. */ |
| 1881 | if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && | 1950 | if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && |
| 1882 | (skb->len < (cur_mss >> 1)) && | 1951 | (skb->len < (cur_mss >> 1)) && |
| 1883 | (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && | ||
| 1884 | (!tcp_skb_is_last(sk, skb)) && | 1952 | (!tcp_skb_is_last(sk, skb)) && |
| 1953 | (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && | ||
| 1885 | (skb_shinfo(skb)->nr_frags == 0 && | 1954 | (skb_shinfo(skb)->nr_frags == 0 && |
| 1886 | skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && | 1955 | skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && |
| 1887 | (tcp_skb_pcount(skb) == 1 && | 1956 | (tcp_skb_pcount(skb) == 1 && |
| @@ -1913,7 +1982,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1913 | 1982 | ||
| 1914 | if (err == 0) { | 1983 | if (err == 0) { |
| 1915 | /* Update global TCP statistics. */ | 1984 | /* Update global TCP statistics. */ |
| 1916 | TCP_INC_STATS(TCP_MIB_RETRANSSEGS); | 1985 | TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); |
| 1917 | 1986 | ||
| 1918 | tp->total_retrans++; | 1987 | tp->total_retrans++; |
| 1919 | 1988 | ||
| @@ -1942,83 +2011,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) | |||
| 1942 | return err; | 2011 | return err; |
| 1943 | } | 2012 | } |
| 1944 | 2013 | ||
| 1945 | /* This gets called after a retransmit timeout, and the initially | 2014 | static int tcp_can_forward_retransmit(struct sock *sk) |
| 1946 | * retransmitted data is acknowledged. It tries to continue | ||
| 1947 | * resending the rest of the retransmit queue, until either | ||
| 1948 | * we've sent it all or the congestion window limit is reached. | ||
| 1949 | * If doing SACK, the first ACK which comes back for a timeout | ||
| 1950 | * based retransmit packet might feed us FACK information again. | ||
| 1951 | * If so, we use it to avoid unnecessarily retransmissions. | ||
| 1952 | */ | ||
| 1953 | void tcp_xmit_retransmit_queue(struct sock *sk) | ||
| 1954 | { | 2015 | { |
| 1955 | const struct inet_connection_sock *icsk = inet_csk(sk); | 2016 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 1956 | struct tcp_sock *tp = tcp_sk(sk); | 2017 | struct tcp_sock *tp = tcp_sk(sk); |
| 1957 | struct sk_buff *skb; | ||
| 1958 | int packet_cnt; | ||
| 1959 | |||
| 1960 | if (tp->retransmit_skb_hint) { | ||
| 1961 | skb = tp->retransmit_skb_hint; | ||
| 1962 | packet_cnt = tp->retransmit_cnt_hint; | ||
| 1963 | } else { | ||
| 1964 | skb = tcp_write_queue_head(sk); | ||
| 1965 | packet_cnt = 0; | ||
| 1966 | } | ||
| 1967 | |||
| 1968 | /* First pass: retransmit lost packets. */ | ||
| 1969 | if (tp->lost_out) { | ||
| 1970 | tcp_for_write_queue_from(skb, sk) { | ||
| 1971 | __u8 sacked = TCP_SKB_CB(skb)->sacked; | ||
| 1972 | |||
| 1973 | if (skb == tcp_send_head(sk)) | ||
| 1974 | break; | ||
| 1975 | /* we could do better than to assign each time */ | ||
| 1976 | tp->retransmit_skb_hint = skb; | ||
| 1977 | tp->retransmit_cnt_hint = packet_cnt; | ||
| 1978 | |||
| 1979 | /* Assume this retransmit will generate | ||
| 1980 | * only one packet for congestion window | ||
| 1981 | * calculation purposes. This works because | ||
| 1982 | * tcp_retransmit_skb() will chop up the | ||
| 1983 | * packet to be MSS sized and all the | ||
| 1984 | * packet counting works out. | ||
| 1985 | */ | ||
| 1986 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | ||
| 1987 | return; | ||
| 1988 | |||
| 1989 | if (sacked & TCPCB_LOST) { | ||
| 1990 | if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { | ||
| 1991 | if (tcp_retransmit_skb(sk, skb)) { | ||
| 1992 | tp->retransmit_skb_hint = NULL; | ||
| 1993 | return; | ||
| 1994 | } | ||
| 1995 | if (icsk->icsk_ca_state != TCP_CA_Loss) | ||
| 1996 | NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); | ||
| 1997 | else | ||
| 1998 | NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); | ||
| 1999 | |||
| 2000 | if (skb == tcp_write_queue_head(sk)) | ||
| 2001 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | ||
| 2002 | inet_csk(sk)->icsk_rto, | ||
| 2003 | TCP_RTO_MAX); | ||
| 2004 | } | ||
| 2005 | |||
| 2006 | packet_cnt += tcp_skb_pcount(skb); | ||
| 2007 | if (packet_cnt >= tp->lost_out) | ||
| 2008 | break; | ||
| 2009 | } | ||
| 2010 | } | ||
| 2011 | } | ||
| 2012 | |||
| 2013 | /* OK, demanded retransmission is finished. */ | ||
| 2014 | 2018 | ||
| 2015 | /* Forward retransmissions are possible only during Recovery. */ | 2019 | /* Forward retransmissions are possible only during Recovery. */ |
| 2016 | if (icsk->icsk_ca_state != TCP_CA_Recovery) | 2020 | if (icsk->icsk_ca_state != TCP_CA_Recovery) |
| 2017 | return; | 2021 | return 0; |
| 2018 | 2022 | ||
| 2019 | /* No forward retransmissions in Reno are possible. */ | 2023 | /* No forward retransmissions in Reno are possible. */ |
| 2020 | if (tcp_is_reno(tp)) | 2024 | if (tcp_is_reno(tp)) |
| 2021 | return; | 2025 | return 0; |
| 2022 | 2026 | ||
| 2023 | /* Yeah, we have to make difficult choice between forward transmission | 2027 | /* Yeah, we have to make difficult choice between forward transmission |
| 2024 | * and retransmission... Both ways have their merits... | 2028 | * and retransmission... Both ways have their merits... |
| @@ -2029,43 +2033,104 @@ void tcp_xmit_retransmit_queue(struct sock *sk) | |||
| 2029 | */ | 2033 | */ |
| 2030 | 2034 | ||
| 2031 | if (tcp_may_send_now(sk)) | 2035 | if (tcp_may_send_now(sk)) |
| 2032 | return; | 2036 | return 0; |
| 2033 | 2037 | ||
| 2034 | /* If nothing is SACKed, highest_sack in the loop won't be valid */ | 2038 | return 1; |
| 2035 | if (!tp->sacked_out) | 2039 | } |
| 2036 | return; | ||
| 2037 | 2040 | ||
| 2038 | if (tp->forward_skb_hint) | 2041 | /* This gets called after a retransmit timeout, and the initially |
| 2039 | skb = tp->forward_skb_hint; | 2042 | * retransmitted data is acknowledged. It tries to continue |
| 2040 | else | 2043 | * resending the rest of the retransmit queue, until either |
| 2044 | * we've sent it all or the congestion window limit is reached. | ||
| 2045 | * If doing SACK, the first ACK which comes back for a timeout | ||
| 2046 | * based retransmit packet might feed us FACK information again. | ||
| 2047 | * If so, we use it to avoid unnecessarily retransmissions. | ||
| 2048 | */ | ||
| 2049 | void tcp_xmit_retransmit_queue(struct sock *sk) | ||
| 2050 | { | ||
| 2051 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
| 2052 | struct tcp_sock *tp = tcp_sk(sk); | ||
| 2053 | struct sk_buff *skb; | ||
| 2054 | struct sk_buff *hole = NULL; | ||
| 2055 | u32 last_lost; | ||
| 2056 | int mib_idx; | ||
| 2057 | int fwd_rexmitting = 0; | ||
| 2058 | |||
| 2059 | if (!tp->lost_out) | ||
| 2060 | tp->retransmit_high = tp->snd_una; | ||
| 2061 | |||
| 2062 | if (tp->retransmit_skb_hint) { | ||
| 2063 | skb = tp->retransmit_skb_hint; | ||
| 2064 | last_lost = TCP_SKB_CB(skb)->end_seq; | ||
| 2065 | if (after(last_lost, tp->retransmit_high)) | ||
| 2066 | last_lost = tp->retransmit_high; | ||
| 2067 | } else { | ||
| 2041 | skb = tcp_write_queue_head(sk); | 2068 | skb = tcp_write_queue_head(sk); |
| 2069 | last_lost = tp->snd_una; | ||
| 2070 | } | ||
| 2042 | 2071 | ||
| 2072 | /* First pass: retransmit lost packets. */ | ||
| 2043 | tcp_for_write_queue_from(skb, sk) { | 2073 | tcp_for_write_queue_from(skb, sk) { |
| 2044 | if (skb == tcp_send_head(sk)) | 2074 | __u8 sacked = TCP_SKB_CB(skb)->sacked; |
| 2045 | break; | ||
| 2046 | tp->forward_skb_hint = skb; | ||
| 2047 | 2075 | ||
| 2048 | if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) | 2076 | if (skb == tcp_send_head(sk)) |
| 2049 | break; | 2077 | break; |
| 2078 | /* we could do better than to assign each time */ | ||
| 2079 | if (hole == NULL) | ||
| 2080 | tp->retransmit_skb_hint = skb; | ||
| 2050 | 2081 | ||
| 2082 | /* Assume this retransmit will generate | ||
| 2083 | * only one packet for congestion window | ||
| 2084 | * calculation purposes. This works because | ||
| 2085 | * tcp_retransmit_skb() will chop up the | ||
| 2086 | * packet to be MSS sized and all the | ||
| 2087 | * packet counting works out. | ||
| 2088 | */ | ||
| 2051 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) | 2089 | if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) |
| 2052 | break; | 2090 | return; |
| 2091 | |||
| 2092 | if (fwd_rexmitting) { | ||
| 2093 | begin_fwd: | ||
| 2094 | if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) | ||
| 2095 | break; | ||
| 2096 | mib_idx = LINUX_MIB_TCPFORWARDRETRANS; | ||
| 2097 | |||
| 2098 | } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { | ||
| 2099 | tp->retransmit_high = last_lost; | ||
| 2100 | if (!tcp_can_forward_retransmit(sk)) | ||
| 2101 | break; | ||
| 2102 | /* Backtrack if necessary to non-L'ed skb */ | ||
| 2103 | if (hole != NULL) { | ||
| 2104 | skb = hole; | ||
| 2105 | hole = NULL; | ||
| 2106 | } | ||
| 2107 | fwd_rexmitting = 1; | ||
| 2108 | goto begin_fwd; | ||
| 2053 | 2109 | ||
| 2054 | if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) | 2110 | } else if (!(sacked & TCPCB_LOST)) { |
| 2111 | if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) | ||
| 2112 | hole = skb; | ||
| 2055 | continue; | 2113 | continue; |
| 2056 | 2114 | ||
| 2057 | /* Ok, retransmit it. */ | 2115 | } else { |
| 2058 | if (tcp_retransmit_skb(sk, skb)) { | 2116 | last_lost = TCP_SKB_CB(skb)->end_seq; |
| 2059 | tp->forward_skb_hint = NULL; | 2117 | if (icsk->icsk_ca_state != TCP_CA_Loss) |
| 2060 | break; | 2118 | mib_idx = LINUX_MIB_TCPFASTRETRANS; |
| 2119 | else | ||
| 2120 | mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; | ||
| 2061 | } | 2121 | } |
| 2062 | 2122 | ||
| 2123 | if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) | ||
| 2124 | continue; | ||
| 2125 | |||
| 2126 | if (tcp_retransmit_skb(sk, skb)) | ||
| 2127 | return; | ||
| 2128 | NET_INC_STATS_BH(sock_net(sk), mib_idx); | ||
| 2129 | |||
| 2063 | if (skb == tcp_write_queue_head(sk)) | 2130 | if (skb == tcp_write_queue_head(sk)) |
| 2064 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2131 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| 2065 | inet_csk(sk)->icsk_rto, | 2132 | inet_csk(sk)->icsk_rto, |
| 2066 | TCP_RTO_MAX); | 2133 | TCP_RTO_MAX); |
| 2067 | |||
| 2068 | NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); | ||
| 2069 | } | 2134 | } |
| 2070 | } | 2135 | } |
| 2071 | 2136 | ||
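The old two-pass walk in tcp_xmit_retransmit_queue() (lost packets first, then a separate forward-retransmission pass) becomes a single loop: retransmit_high remembers how far LOST marking extends, hole records the first skb that is neither marked lost nor already retransmitted so the walk can rewind to it when forward retransmission starts, and mib_idx picks which counter to bump. The sketch below models only that per-segment classification over a flat array, dropping the congestion-window test, the hint bookkeeping, and the tcp_can_forward_retransmit() gate; the flag names echo the kernel's, but the function and types are local to this example.

```c
#include <stdio.h>

#define F_LOST            0x1
#define F_SACKED_ACKED    0x2
#define F_SACKED_RETRANS  0x4

static void walk(unsigned char *flags, int n, int retransmit_high, int highest_sack)
{
	int hole = -1, fwd = 0, i = 0;

	while (i < n) {
		if (fwd && i >= highest_sack)
			break;               /* forward pass never passes the highest SACK */
		if (!fwd && i >= retransmit_high) {
			fwd = 1;             /* lost region exhausted, switch modes */
			if (hole >= 0) {     /* backtrack to the first clean gap */
				i = hole;
				hole = -1;
			}
			continue;
		}
		if (!fwd && !(flags[i] & F_LOST)) {
			if (hole < 0 && !(flags[i] & F_SACKED_RETRANS))
				hole = i;    /* remember it for the forward pass */
			i++;
			continue;
		}
		if (flags[i] & (F_SACKED_ACKED | F_SACKED_RETRANS)) {
			i++;                 /* already delivered or already resent */
			continue;
		}
		printf("retransmit seg %d (%s)\n", i, fwd ? "forward" : "lost");
		flags[i] |= F_SACKED_RETRANS;    /* as a real retransmit would mark it */
		i++;
	}
}

int main(void)
{
	/* segments 0..5: lost, clean, SACKed, lost, clean, clean */
	unsigned char flags[] = { F_LOST, 0, F_SACKED_ACKED, F_LOST, 0, 0 };

	/* lost marking extends through segment 3, highest SACK sits past segment 2 */
	walk(flags, 6, 4, 3);
	return 0;
}
```

Running it retransmits segments 0 and 3 in the lost pass, then rewinds to segment 1 for a forward retransmission and stops at the highest SACKed segment.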
| @@ -2119,7 +2184,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) | |||
| 2119 | /* NOTE: No TCP options attached and we never retransmit this. */ | 2184 | /* NOTE: No TCP options attached and we never retransmit this. */ |
| 2120 | skb = alloc_skb(MAX_TCP_HEADER, priority); | 2185 | skb = alloc_skb(MAX_TCP_HEADER, priority); |
| 2121 | if (!skb) { | 2186 | if (!skb) { |
| 2122 | NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); | 2187 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); |
| 2123 | return; | 2188 | return; |
| 2124 | } | 2189 | } |
| 2125 | 2190 | ||
| @@ -2130,9 +2195,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) | |||
| 2130 | /* Send it off. */ | 2195 | /* Send it off. */ |
| 2131 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2196 | TCP_SKB_CB(skb)->when = tcp_time_stamp; |
| 2132 | if (tcp_transmit_skb(sk, skb, 0, priority)) | 2197 | if (tcp_transmit_skb(sk, skb, 0, priority)) |
| 2133 | NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); | 2198 | NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); |
| 2134 | 2199 | ||
| 2135 | TCP_INC_STATS(TCP_MIB_OUTRSTS); | 2200 | TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); |
| 2136 | } | 2201 | } |
| 2137 | 2202 | ||
| 2138 | /* WARNING: This routine must only be called when we have already sent | 2203 | /* WARNING: This routine must only be called when we have already sent |
| @@ -2180,11 +2245,11 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2180 | struct tcp_sock *tp = tcp_sk(sk); | 2245 | struct tcp_sock *tp = tcp_sk(sk); |
| 2181 | struct tcphdr *th; | 2246 | struct tcphdr *th; |
| 2182 | int tcp_header_size; | 2247 | int tcp_header_size; |
| 2248 | struct tcp_out_options opts; | ||
| 2183 | struct sk_buff *skb; | 2249 | struct sk_buff *skb; |
| 2184 | #ifdef CONFIG_TCP_MD5SIG | ||
| 2185 | struct tcp_md5sig_key *md5; | 2250 | struct tcp_md5sig_key *md5; |
| 2186 | __u8 *md5_hash_location; | 2251 | __u8 *md5_hash_location; |
| 2187 | #endif | 2252 | int mss; |
| 2188 | 2253 | ||
| 2189 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); | 2254 | skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); |
| 2190 | if (skb == NULL) | 2255 | if (skb == NULL) |
| @@ -2195,18 +2260,30 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2195 | 2260 | ||
| 2196 | skb->dst = dst_clone(dst); | 2261 | skb->dst = dst_clone(dst); |
| 2197 | 2262 | ||
| 2198 | tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + | 2263 | mss = dst_metric(dst, RTAX_ADVMSS); |
| 2199 | (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + | 2264 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) |
| 2200 | (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + | 2265 | mss = tp->rx_opt.user_mss; |
| 2201 | /* SACK_PERM is in the place of NOP NOP of TS */ | 2266 | |
| 2202 | ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); | 2267 | if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ |
| 2268 | __u8 rcv_wscale; | ||
| 2269 | /* Set this up on the first call only */ | ||
| 2270 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); | ||
| 2271 | /* tcp_full_space because it is guaranteed to be the first packet */ | ||
| 2272 | tcp_select_initial_window(tcp_full_space(sk), | ||
| 2273 | mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | ||
| 2274 | &req->rcv_wnd, | ||
| 2275 | &req->window_clamp, | ||
| 2276 | ireq->wscale_ok, | ||
| 2277 | &rcv_wscale); | ||
| 2278 | ireq->rcv_wscale = rcv_wscale; | ||
| 2279 | } | ||
| 2280 | |||
| 2281 | memset(&opts, 0, sizeof(opts)); | ||
| 2282 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | ||
| 2283 | tcp_header_size = tcp_synack_options(sk, req, mss, | ||
| 2284 | skb, &opts, &md5) + | ||
| 2285 | sizeof(struct tcphdr); | ||
| 2203 | 2286 | ||
| 2204 | #ifdef CONFIG_TCP_MD5SIG | ||
| 2205 | /* Are we doing MD5 on this segment? If so - make room for it */ | ||
| 2206 | md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); | ||
| 2207 | if (md5) | ||
| 2208 | tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; | ||
| 2209 | #endif | ||
| 2210 | skb_push(skb, tcp_header_size); | 2287 | skb_push(skb, tcp_header_size); |
| 2211 | skb_reset_transport_header(skb); | 2288 | skb_reset_transport_header(skb); |
| 2212 | 2289 | ||
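In tcp_make_synack() the advertised MSS and the initial receive window are now computed before the options are sized, since tcp_synack_options() needs the final mss value, and the route's advertised MSS is clamped to any user-requested value (tp->rx_opt.user_mss). A tiny sketch of that clamp; the 1460/1200 figures are example numbers only:

```c
#include <stdio.h>

int main(void)
{
	unsigned route_advmss = 1460;  /* example dst_metric(dst, RTAX_ADVMSS) value */
	unsigned user_mss     = 1200;  /* e.g. a TCP_MAXSEG-style request; 0 = unset */

	unsigned mss = route_advmss;
	if (user_mss && user_mss < mss)
		mss = user_mss;        /* never advertise more than the user asked for */

	printf("advertised MSS in SYN/ACK: %u\n", mss);
	return 0;
}
```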
| @@ -2215,7 +2292,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2215 | th->syn = 1; | 2292 | th->syn = 1; |
| 2216 | th->ack = 1; | 2293 | th->ack = 1; |
| 2217 | TCP_ECN_make_synack(req, th); | 2294 | TCP_ECN_make_synack(req, th); |
| 2218 | th->source = inet_sk(sk)->sport; | 2295 | th->source = ireq->loc_port; |
| 2219 | th->dest = ireq->rmt_port; | 2296 | th->dest = ireq->rmt_port; |
| 2220 | /* Setting of flags are superfluous here for callers (and ECE is | 2297 | /* Setting of flags are superfluous here for callers (and ECE is |
| 2221 | * not even correctly set) | 2298 | * not even correctly set) |
| @@ -2224,19 +2301,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2224 | TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); | 2301 | TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); |
| 2225 | th->seq = htonl(TCP_SKB_CB(skb)->seq); | 2302 | th->seq = htonl(TCP_SKB_CB(skb)->seq); |
| 2226 | th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); | 2303 | th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); |
| 2227 | if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ | ||
| 2228 | __u8 rcv_wscale; | ||
| 2229 | /* Set this up on the first call only */ | ||
| 2230 | req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); | ||
| 2231 | /* tcp_full_space because it is guaranteed to be the first packet */ | ||
| 2232 | tcp_select_initial_window(tcp_full_space(sk), | ||
| 2233 | dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), | ||
| 2234 | &req->rcv_wnd, | ||
| 2235 | &req->window_clamp, | ||
| 2236 | ireq->wscale_ok, | ||
| 2237 | &rcv_wscale); | ||
| 2238 | ireq->rcv_wscale = rcv_wscale; | ||
| 2239 | } | ||
| 2240 | 2304 | ||
| 2241 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ | 2305 | /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ |
| 2242 | th->window = htons(min(req->rcv_wnd, 65535U)); | 2306 | th->window = htons(min(req->rcv_wnd, 65535U)); |
| @@ -2245,29 +2309,15 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, | |||
| 2245 | TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); | 2309 | TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); |
| 2246 | else | 2310 | else |
| 2247 | #endif | 2311 | #endif |
| 2248 | TCP_SKB_CB(skb)->when = tcp_time_stamp; | 2312 | tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); |
| 2249 | tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, | ||
| 2250 | ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, | ||
| 2251 | TCP_SKB_CB(skb)->when, | ||
| 2252 | req->ts_recent, | ||
| 2253 | ( | ||
| 2254 | #ifdef CONFIG_TCP_MD5SIG | ||
| 2255 | md5 ? &md5_hash_location : | ||
| 2256 | #endif | ||
| 2257 | NULL) | ||
| 2258 | ); | ||
| 2259 | |||
| 2260 | th->doff = (tcp_header_size >> 2); | 2313 | th->doff = (tcp_header_size >> 2); |
| 2261 | TCP_INC_STATS(TCP_MIB_OUTSEGS); | 2314 | TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); |
| 2262 | 2315 | ||
| 2263 | #ifdef CONFIG_TCP_MD5SIG | 2316 | #ifdef CONFIG_TCP_MD5SIG |
| 2264 | /* Okay, we have all we need - do the md5 hash if needed */ | 2317 | /* Okay, we have all we need - do the md5 hash if needed */ |
| 2265 | if (md5) { | 2318 | if (md5) { |
| 2266 | tp->af_specific->calc_md5_hash(md5_hash_location, | 2319 | tp->af_specific->calc_md5_hash(md5_hash_location, |
| 2267 | md5, | 2320 | md5, NULL, req, skb); |
| 2268 | NULL, dst, req, | ||
| 2269 | tcp_hdr(skb), sk->sk_protocol, | ||
| 2270 | skb->len); | ||
| 2271 | } | 2321 | } |
| 2272 | #endif | 2322 | #endif |
| 2273 | 2323 | ||
| @@ -2304,6 +2354,9 @@ static void tcp_connect_init(struct sock *sk) | |||
| 2304 | if (!tp->window_clamp) | 2354 | if (!tp->window_clamp) |
| 2305 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); | 2355 | tp->window_clamp = dst_metric(dst, RTAX_WINDOW); |
| 2306 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); | 2356 | tp->advmss = dst_metric(dst, RTAX_ADVMSS); |
| 2357 | if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) | ||
| 2358 | tp->advmss = tp->rx_opt.user_mss; | ||
| 2359 | |||
| 2307 | tcp_initialize_rcv_mss(sk); | 2360 | tcp_initialize_rcv_mss(sk); |
| 2308 | 2361 | ||
| 2309 | tcp_select_initial_window(tcp_full_space(sk), | 2362 | tcp_select_initial_window(tcp_full_space(sk), |
| @@ -2322,6 +2375,7 @@ static void tcp_connect_init(struct sock *sk) | |||
| 2322 | tcp_init_wl(tp, tp->write_seq, 0); | 2375 | tcp_init_wl(tp, tp->write_seq, 0); |
| 2323 | tp->snd_una = tp->write_seq; | 2376 | tp->snd_una = tp->write_seq; |
| 2324 | tp->snd_sml = tp->write_seq; | 2377 | tp->snd_sml = tp->write_seq; |
| 2378 | tp->snd_up = tp->write_seq; | ||
| 2325 | tp->rcv_nxt = 0; | 2379 | tp->rcv_nxt = 0; |
| 2326 | tp->rcv_wup = 0; | 2380 | tp->rcv_wup = 0; |
| 2327 | tp->copied_seq = 0; | 2381 | tp->copied_seq = 0; |
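Because urgent mode is now derived from snd_una != snd_up rather than a dedicated flag, tcp_connect_init() has to initialize snd_up alongside snd_una and snd_sml; otherwise a freshly connecting socket, whose snd_up would still read zero against a nonzero initial sequence number, would likely look permanently urgent to tcp_urg_mode(). A tiny illustration (the ISN value is arbitrary):

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t write_seq = 0x7ab32f1du;  /* arbitrary initial sequence number */
	uint32_t snd_una = write_seq, snd_up;

	snd_up = 0;          /* without the added line: stale zero value */
	printf("urg_mode without the fix: %d\n", snd_una != snd_up);  /* 1 (wrong) */

	snd_up = write_seq;  /* tcp_connect_init() now does this */
	printf("urg_mode with the fix:    %d\n", snd_una != snd_up);  /* 0 */
	return 0;
}
```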
| @@ -2367,7 +2421,7 @@ int tcp_connect(struct sock *sk) | |||
| 2367 | */ | 2421 | */ |
| 2368 | tp->snd_nxt = tp->write_seq; | 2422 | tp->snd_nxt = tp->write_seq; |
| 2369 | tp->pushed_seq = tp->write_seq; | 2423 | tp->pushed_seq = tp->write_seq; |
| 2370 | TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); | 2424 | TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); |
| 2371 | 2425 | ||
| 2372 | /* Timer for repeating the SYN until an answer. */ | 2426 | /* Timer for repeating the SYN until an answer. */ |
| 2373 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2427 | inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
| @@ -2531,8 +2585,7 @@ int tcp_write_wakeup(struct sock *sk) | |||
| 2531 | tcp_event_new_data_sent(sk, skb); | 2585 | tcp_event_new_data_sent(sk, skb); |
| 2532 | return err; | 2586 | return err; |
| 2533 | } else { | 2587 | } else { |
| 2534 | if (tp->urg_mode && | 2588 | if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) |
| 2535 | between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) | ||
| 2536 | tcp_xmit_probe_skb(sk, 1); | 2589 | tcp_xmit_probe_skb(sk, 1); |
| 2537 | return tcp_xmit_probe_skb(sk, 0); | 2590 | return tcp_xmit_probe_skb(sk, 0); |
| 2538 | } | 2591 | } |
