author     Jeff Garzik <jgarzik@pobox.com>   2005-07-13 16:23:51 -0400
committer  Jeff Garzik <jgarzik@pobox.com>   2005-07-13 16:23:51 -0400
commit     327309e899662b482c58cf25f574513d38b5788c
tree       069de438aa0e92dd9b6ba28e6b207e2cd07151a5
parent     0c168775709faa74c1b87f1e61046e0c51ade7f3
parent     c32511e2718618f0b53479eb36e07439aa363a74
Merge upstream 2.6.13-rc3 into ieee80211 branch of netdev-2.6.
Diffstat (limited to 'net/ipv4')
-rw-r--r--  net/ipv4/Kconfig                                    25
-rw-r--r--  net/ipv4/af_inet.c                                  11
-rw-r--r--  net/ipv4/fib_trie.c                                202
-rw-r--r--  net/ipv4/icmp.c                                      3
-rw-r--r--  net/ipv4/igmp.c                                     96
-rw-r--r--  net/ipv4/ip_output.c                                25
-rw-r--r--  net/ipv4/ip_sockglue.c                               6
-rw-r--r--  net/ipv4/ipvs/Kconfig                                4
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c                           6
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c                            9
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c         7
-rw-r--r--  net/ipv4/route.c                                   128
-rw-r--r--  net/ipv4/tcp.c                                      52
-rw-r--r--  net/ipv4/tcp_input.c                                87
-rw-r--r--  net/ipv4/tcp_ipv4.c                                  2
-rw-r--r--  net/ipv4/tcp_output.c                              546
-rw-r--r--  net/ipv4/tcp_timer.c                                 5
17 files changed, 847 insertions, 367 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 3e63123f7bbd..df5386885a90 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config IP_MULTICAST 4config IP_MULTICAST
5 bool "IP: multicasting" 5 bool "IP: multicasting"
6 depends on INET
7 help 6 help
8 This is code for addressing several networked computers at once, 7 This is code for addressing several networked computers at once,
9 enlarging your kernel by about 2 KB. You need multicasting if you 8 enlarging your kernel by about 2 KB. You need multicasting if you
@@ -17,7 +16,6 @@ config IP_MULTICAST
17 16
18config IP_ADVANCED_ROUTER 17config IP_ADVANCED_ROUTER
19 bool "IP: advanced router" 18 bool "IP: advanced router"
20 depends on INET
21 ---help--- 19 ---help---
22 If you intend to run your Linux box mostly as a router, i.e. as a 20 If you intend to run your Linux box mostly as a router, i.e. as a
23 computer that forwards and redistributes network packets, say Y; you 21 computer that forwards and redistributes network packets, say Y; you
@@ -183,7 +181,6 @@ config IP_ROUTE_VERBOSE
183 181
184config IP_PNP 182config IP_PNP
185 bool "IP: kernel level autoconfiguration" 183 bool "IP: kernel level autoconfiguration"
186 depends on INET
187 help 184 help
188 This enables automatic configuration of IP addresses of devices and 185 This enables automatic configuration of IP addresses of devices and
189 of the routing table during kernel boot, based on either information 186 of the routing table during kernel boot, based on either information
@@ -242,7 +239,6 @@ config IP_PNP_RARP
242# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 239# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
243config NET_IPIP 240config NET_IPIP
244 tristate "IP: tunneling" 241 tristate "IP: tunneling"
245 depends on INET
246 select INET_TUNNEL 242 select INET_TUNNEL
247 ---help--- 243 ---help---
248 Tunneling means encapsulating data of one protocol type within 244 Tunneling means encapsulating data of one protocol type within
@@ -260,7 +256,6 @@ config NET_IPIP
260 256
261config NET_IPGRE 257config NET_IPGRE
262 tristate "IP: GRE tunnels over IP" 258 tristate "IP: GRE tunnels over IP"
263 depends on INET
264 select XFRM 259 select XFRM
265 help 260 help
266 Tunneling means encapsulating data of one protocol type within 261 Tunneling means encapsulating data of one protocol type within
@@ -319,7 +314,7 @@ config IP_PIMSM_V2
319 314
320config ARPD 315config ARPD
321 bool "IP: ARP daemon support (EXPERIMENTAL)" 316 bool "IP: ARP daemon support (EXPERIMENTAL)"
322 depends on INET && EXPERIMENTAL 317 depends on EXPERIMENTAL
323 ---help--- 318 ---help---
324 Normally, the kernel maintains an internal cache which maps IP 319 Normally, the kernel maintains an internal cache which maps IP
325 addresses to hardware addresses on the local network, so that 320 addresses to hardware addresses on the local network, so that
@@ -344,7 +339,6 @@ config ARPD
344 339
345config SYN_COOKIES 340config SYN_COOKIES
346 bool "IP: TCP syncookie support (disabled per default)" 341 bool "IP: TCP syncookie support (disabled per default)"
347 depends on INET
348 ---help--- 342 ---help---
349 Normal TCP/IP networking is open to an attack known as "SYN 343 Normal TCP/IP networking is open to an attack known as "SYN
350 flooding". This denial-of-service attack prevents legitimate remote 344 flooding". This denial-of-service attack prevents legitimate remote
@@ -381,7 +375,6 @@ config SYN_COOKIES
381 375
382config INET_AH 376config INET_AH
383 tristate "IP: AH transformation" 377 tristate "IP: AH transformation"
384 depends on INET
385 select XFRM 378 select XFRM
386 select CRYPTO 379 select CRYPTO
387 select CRYPTO_HMAC 380 select CRYPTO_HMAC
@@ -394,7 +387,6 @@ config INET_AH
394 387
395config INET_ESP 388config INET_ESP
396 tristate "IP: ESP transformation" 389 tristate "IP: ESP transformation"
397 depends on INET
398 select XFRM 390 select XFRM
399 select CRYPTO 391 select CRYPTO
400 select CRYPTO_HMAC 392 select CRYPTO_HMAC
@@ -408,7 +400,6 @@ config INET_ESP
408 400
409config INET_IPCOMP 401config INET_IPCOMP
410 tristate "IP: IPComp transformation" 402 tristate "IP: IPComp transformation"
411 depends on INET
412 select XFRM 403 select XFRM
413 select INET_TUNNEL 404 select INET_TUNNEL
414 select CRYPTO 405 select CRYPTO
@@ -421,7 +412,6 @@ config INET_IPCOMP
421 412
422config INET_TUNNEL 413config INET_TUNNEL
423 tristate "IP: tunnel transformation" 414 tristate "IP: tunnel transformation"
424 depends on INET
425 select XFRM 415 select XFRM
426 ---help--- 416 ---help---
427 Support for generic IP tunnel transformation, which is required by 417 Support for generic IP tunnel transformation, which is required by
@@ -431,7 +421,6 @@ config INET_TUNNEL
431 421
432config IP_TCPDIAG 422config IP_TCPDIAG
433 tristate "IP: TCP socket monitoring interface" 423 tristate "IP: TCP socket monitoring interface"
434 depends on INET
435 default y 424 default y
436 ---help--- 425 ---help---
437 Support for TCP socket monitoring interface used by native Linux 426 Support for TCP socket monitoring interface used by native Linux
@@ -447,7 +436,6 @@ config IP_TCPDIAG_IPV6
447 436
448config TCP_CONG_ADVANCED 437config TCP_CONG_ADVANCED
449 bool "TCP: advanced congestion control" 438 bool "TCP: advanced congestion control"
450 depends on INET
451 ---help--- 439 ---help---
452 Support for selection of various TCP congestion control 440 Support for selection of various TCP congestion control
453 modules. 441 modules.
@@ -463,7 +451,6 @@ menu "TCP congestion control"
463 451
464config TCP_CONG_BIC 452config TCP_CONG_BIC
465 tristate "Binary Increase Congestion (BIC) control" 453 tristate "Binary Increase Congestion (BIC) control"
466 depends on INET
467 default y 454 default y
468 ---help--- 455 ---help---
469 BIC-TCP is a sender-side only change that ensures a linear RTT 456 BIC-TCP is a sender-side only change that ensures a linear RTT
@@ -478,7 +465,6 @@ config TCP_CONG_BIC
478 465
479config TCP_CONG_WESTWOOD 466config TCP_CONG_WESTWOOD
480 tristate "TCP Westwood+" 467 tristate "TCP Westwood+"
481 depends on INET
482 default m 468 default m
483 ---help--- 469 ---help---
484 TCP Westwood+ is a sender-side only modification of the TCP Reno 470 TCP Westwood+ is a sender-side only modification of the TCP Reno
@@ -493,7 +479,6 @@ config TCP_CONG_WESTWOOD
493 479
494config TCP_CONG_HTCP 480config TCP_CONG_HTCP
495 tristate "H-TCP" 481 tristate "H-TCP"
496 depends on INET
497 default m 482 default m
498 ---help--- 483 ---help---
499 H-TCP is a send-side only modifications of the TCP Reno 484 H-TCP is a send-side only modifications of the TCP Reno
@@ -505,7 +490,7 @@ config TCP_CONG_HTCP
505 490
506config TCP_CONG_HSTCP 491config TCP_CONG_HSTCP
507 tristate "High Speed TCP" 492 tristate "High Speed TCP"
508 depends on INET && EXPERIMENTAL 493 depends on EXPERIMENTAL
509 default n 494 default n
510 ---help--- 495 ---help---
511 Sally Floyd's High Speed TCP (RFC 3649) congestion control. 496 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
@@ -516,7 +501,7 @@ config TCP_CONG_HSTCP
516 501
517config TCP_CONG_HYBLA 502config TCP_CONG_HYBLA
518 tristate "TCP-Hybla congestion control algorithm" 503 tristate "TCP-Hybla congestion control algorithm"
519 depends on INET && EXPERIMENTAL 504 depends on EXPERIMENTAL
520 default n 505 default n
521 ---help--- 506 ---help---
522 TCP-Hybla is a sender-side only change that eliminates penalization of 507 TCP-Hybla is a sender-side only change that eliminates penalization of
@@ -526,7 +511,7 @@ config TCP_CONG_HYBLA
526 511
527config TCP_CONG_VEGAS 512config TCP_CONG_VEGAS
528 tristate "TCP Vegas" 513 tristate "TCP Vegas"
529 depends on INET && EXPERIMENTAL 514 depends on EXPERIMENTAL
530 default n 515 default n
531 ---help--- 516 ---help---
532 TCP Vegas is a sender-side only change to TCP that anticipates 517 TCP Vegas is a sender-side only change to TCP that anticipates
@@ -537,7 +522,7 @@ config TCP_CONG_VEGAS
537 522
538config TCP_CONG_SCALABLE 523config TCP_CONG_SCALABLE
539 tristate "Scalable TCP" 524 tristate "Scalable TCP"
540 depends on INET && EXPERIMENTAL 525 depends on EXPERIMENTAL
541 default n 526 default n
542 ---help--- 527 ---help---
543 Scalable TCP is a sender-side only change to TCP which uses a 528 Scalable TCP is a sender-side only change to TCP which uses a
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
1009static int ipv4_proc_init(void); 1009static int ipv4_proc_init(void);
1010extern void ipfrag_init(void); 1010extern void ipfrag_init(void);
1011 1011
1012/*
1013 * IP protocol layer initialiser
1014 */
1015
1016static struct packet_type ip_packet_type = {
1017 .type = __constant_htons(ETH_P_IP),
1018 .func = ip_rcv,
1019};
1020
1012static int __init inet_init(void) 1021static int __init inet_init(void)
1013{ 1022{
1014 struct sk_buff *dummy_skb; 1023 struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
1102 1111
1103 ipfrag_init(); 1112 ipfrag_init();
1104 1113
1114 dev_add_pack(&ip_packet_type);
1115
1105 rc = 0; 1116 rc = 0;
1106out: 1117out:
1107 return rc; 1118 return rc;
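
The af_inet.c hunk above moves the ETH_P_IP packet_type definition into this file and only registers it with dev_add_pack() at the tail of inet_init(), after the protocol handlers and ipfrag_init() have run (the matching removal from ip_init() is in the ip_output.c hunk further down). A minimal userspace sketch of that "initialize first, publish last" ordering; all names below are illustrative, none of this is kernel API:

/*
 * Sketch (not from the patch): initialize internal state before making the
 * receive path reachable, so a handler can never run half-initialized.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef void (*rx_handler_t)(const char *pkt);

static rx_handler_t registered_handler;            /* analogue of the packet_type list */
static pthread_mutex_t handler_lock = PTHREAD_MUTEX_INITIALIZER;
static int frag_cache_ready;                       /* analogue of ipfrag_init() state  */

static void ip_rx(const char *pkt)
{
    /* Safe: registration happened only after frag_cache_ready was set. */
    if (!frag_cache_ready)
        abort();
    printf("got packet: %s\n", pkt);
}

static void register_handler(rx_handler_t h)       /* analogue of dev_add_pack() */
{
    pthread_mutex_lock(&handler_lock);
    registered_handler = h;
    pthread_mutex_unlock(&handler_lock);
}

static void proto_init(void)                       /* analogue of inet_init() */
{
    frag_cache_ready = 1;        /* set up internal state first ...            */
    register_handler(ip_rx);     /* ... and only then make ourselves reachable */
}

int main(void)
{
    proto_init();
    if (registered_handler)
        registered_handler("hello");
    return 0;
}
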
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b56e88edf1b3..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
43 * 2 of the License, or (at your option) any later version. 43 * 2 of the License, or (at your option) any later version.
44 */ 44 */
45 45
46#define VERSION "0.324" 46#define VERSION "0.325"
47 47
48#include <linux/config.h> 48#include <linux/config.h>
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
136 unsigned int semantic_match_passed; 136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss; 137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit; 138 unsigned int null_node_hit;
139 unsigned int resize_node_skipped;
139}; 140};
140#endif 141#endif
141 142
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
164static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); 165static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
165static int tnode_child_length(struct tnode *tn); 166static int tnode_child_length(struct tnode *tn);
166static struct node *resize(struct trie *t, struct tnode *tn); 167static struct node *resize(struct trie *t, struct tnode *tn);
167static struct tnode *inflate(struct trie *t, struct tnode *tn); 168static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
168static struct tnode *halve(struct trie *t, struct tnode *tn); 169static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
169static void tnode_free(struct tnode *tn); 170static void tnode_free(struct tnode *tn);
170static void trie_dump_seq(struct seq_file *seq, struct trie *t); 171static void trie_dump_seq(struct seq_file *seq, struct trie *t);
171extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); 172extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -358,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
358 kfree(li); 359 kfree(li);
359} 360}
360 361
362static struct tnode *tnode_alloc(unsigned int size)
363{
364 if (size <= PAGE_SIZE) {
365 return kmalloc(size, GFP_KERNEL);
366 } else {
367 return (struct tnode *)
368 __get_free_pages(GFP_KERNEL, get_order(size));
369 }
370}
371
372static void __tnode_free(struct tnode *tn)
373{
374 unsigned int size = sizeof(struct tnode) +
375 (1<<tn->bits) * sizeof(struct node *);
376
377 if (size <= PAGE_SIZE)
378 kfree(tn);
379 else
380 free_pages((unsigned long)tn, get_order(size));
381}
382
361static struct tnode* tnode_new(t_key key, int pos, int bits) 383static struct tnode* tnode_new(t_key key, int pos, int bits)
362{ 384{
363 int nchildren = 1<<bits; 385 int nchildren = 1<<bits;
364 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); 386 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
365 struct tnode *tn = kmalloc(sz, GFP_KERNEL); 387 struct tnode *tn = tnode_alloc(sz);
366 388
367 if(tn) { 389 if(tn) {
368 memset(tn, 0, sz); 390 memset(tn, 0, sz);
@@ -390,7 +412,7 @@ static void tnode_free(struct tnode *tn)
390 printk("FL %p \n", tn); 412 printk("FL %p \n", tn);
391 } 413 }
392 else if(IS_TNODE(tn)) { 414 else if(IS_TNODE(tn)) {
393 kfree(tn); 415 __tnode_free(tn);
394 if(trie_debug > 0 ) 416 if(trie_debug > 0 )
395 printk("FT %p \n", tn); 417 printk("FT %p \n", tn);
396 } 418 }
@@ -460,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
460static struct node *resize(struct trie *t, struct tnode *tn) 482static struct node *resize(struct trie *t, struct tnode *tn)
461{ 483{
462 int i; 484 int i;
485 int err = 0;
463 486
464 if (!tn) 487 if (!tn)
465 return NULL; 488 return NULL;
@@ -556,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
556 */ 579 */
557 580
558 check_tnode(tn); 581 check_tnode(tn);
559 582
583 err = 0;
560 while ((tn->full_children > 0 && 584 while ((tn->full_children > 0 &&
561 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= 585 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
562 inflate_threshold * tnode_child_length(tn))) { 586 inflate_threshold * tnode_child_length(tn))) {
563 587
564 tn = inflate(t, tn); 588 tn = inflate(t, tn, &err);
589
590 if(err) {
591#ifdef CONFIG_IP_FIB_TRIE_STATS
592 t->stats.resize_node_skipped++;
593#endif
594 break;
595 }
565 } 596 }
566 597
567 check_tnode(tn); 598 check_tnode(tn);
@@ -570,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
570 * Halve as long as the number of empty children in this 601 * Halve as long as the number of empty children in this
571 * node is above threshold. 602 * node is above threshold.
572 */ 603 */
604
605 err = 0;
573 while (tn->bits > 1 && 606 while (tn->bits > 1 &&
574 100 * (tnode_child_length(tn) - tn->empty_children) < 607 100 * (tnode_child_length(tn) - tn->empty_children) <
575 halve_threshold * tnode_child_length(tn)) 608 halve_threshold * tnode_child_length(tn)) {
609
610 tn = halve(t, tn, &err);
611
612 if(err) {
613#ifdef CONFIG_IP_FIB_TRIE_STATS
614 t->stats.resize_node_skipped++;
615#endif
616 break;
617 }
618 }
576 619
577 tn = halve(t, tn);
578 620
579 /* Only one child remains */ 621 /* Only one child remains */
580 622
@@ -599,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
599 return (struct node *) tn; 641 return (struct node *) tn;
600} 642}
601 643
602static struct tnode *inflate(struct trie *t, struct tnode *tn) 644static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
603{ 645{
604 struct tnode *inode; 646 struct tnode *inode;
605 struct tnode *oldtnode = tn; 647 struct tnode *oldtnode = tn;
@@ -611,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
611 653
612 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); 654 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
613 655
614 if (!tn) 656 if (!tn) {
615 trie_bug("tnode_new failed"); 657 *err = -ENOMEM;
658 return oldtnode;
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664 * fails. In case of failure we return the oldnode and inflate
665 * of tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
616 713
617 for(i = 0; i < olen; i++) { 714 for(i = 0; i < olen; i++) {
618 struct node *node = tnode_get_child(oldtnode, i); 715 struct node *node = tnode_get_child(oldtnode, i);
@@ -625,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
625 722
626 if(IS_LEAF(node) || ((struct tnode *) node)->pos > 723 if(IS_LEAF(node) || ((struct tnode *) node)->pos >
627 tn->pos + tn->bits - 1) { 724 tn->pos + tn->bits - 1) {
628 if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1, 725 if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
629 1) == 0) 726 1) == 0)
630 put_child(t, tn, 2*i, node); 727 put_child(t, tn, 2*i, node);
631 else 728 else
@@ -665,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
665 * the position (inode->pos) 762 * the position (inode->pos)
666 */ 763 */
667 764
668 t_key m = TKEY_GET_MASK(inode->pos, 1);
669
670 /* Use the old key, but set the new significant 765 /* Use the old key, but set the new significant
671 * bit to zero. 766 * bit to zero.
672 */ 767 */
673 left = tnode_new(inode->key&(~m), inode->pos + 1,
674 inode->bits - 1);
675 768
676 if(!left) 769 left = (struct tnode *) tnode_get_child(tn, 2*i);
677 trie_bug("tnode_new failed"); 770 put_child(t, tn, 2*i, NULL);
678 771
679 772 if(!left)
680 /* Use the old key, but set the new significant 773 BUG();
681 * bit to one. 774
682 */ 775 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
683 right = tnode_new(inode->key|m, inode->pos + 1, 776 put_child(t, tn, 2*i+1, NULL);
684 inode->bits - 1); 777
778 if(!right)
779 BUG();
685 780
686 if(!right)
687 trie_bug("tnode_new failed");
688
689 size = tnode_child_length(left); 781 size = tnode_child_length(left);
690 for(j = 0; j < size; j++) { 782 for(j = 0; j < size; j++) {
691 put_child(t, left, j, inode->child[j]); 783 put_child(t, left, j, inode->child[j]);
@@ -701,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
701 return tn; 793 return tn;
702} 794}
703 795
704static struct tnode *halve(struct trie *t, struct tnode *tn) 796static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
705{ 797{
706 struct tnode *oldtnode = tn; 798 struct tnode *oldtnode = tn;
707 struct node *left, *right; 799 struct node *left, *right;
@@ -712,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
712 804
713 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); 805 tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
714 806
715 if(!tn) 807 if (!tn) {
716 trie_bug("tnode_new failed"); 808 *err = -ENOMEM;
809 return oldtnode;
810 }
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815 * fails. In case of failure we return the oldnode and halve
816 * of tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
717 849
718 for(i = 0; i < olen; i += 2) { 850 for(i = 0; i < olen; i += 2) {
719 left = tnode_get_child(oldtnode, i); 851 left = tnode_get_child(oldtnode, i);
@@ -730,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
730 /* Two nonempty children */ 862 /* Two nonempty children */
731 else { 863 else {
732 struct tnode *newBinNode = 864 struct tnode *newBinNode =
733 tnode_new(left->key, tn->pos + tn->bits, 1); 865 (struct tnode *) tnode_get_child(tn, i/2);
866 put_child(t, tn, i/2, NULL);
734 867
735 if(!newBinNode) 868 if(!newBinNode)
736 trie_bug("tnode_new failed"); 869 BUG();
737 870
738 put_child(t, newBinNode, 0, left); 871 put_child(t, newBinNode, 0, left);
739 put_child(t, newBinNode, 1, right); 872 put_child(t, newBinNode, 1, right);
@@ -2301,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2301 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2302 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2303 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2304#ifdef CLEAR_STATS 2438#ifdef CLEAR_STATS
2305 memset(&(t->stats), 0, sizeof(t->stats)); 2439 memset(&(t->stats), 0, sizeof(t->stats));
2306#endif 2440#endif
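
The fib_trie.c changes above drop the trie_bug() panic on allocation failure: inflate() and halve() now preallocate every replacement child node first, and only commit once all allocations have succeeded; on failure they free the staged nodes, bump resize_node_skipped, and hand the untouched old node back with -ENOMEM. A minimal userspace sketch of that preallocate-then-commit shape, with illustrative names rather than the kernel code:

/*
 * Sketch (not from the patch): build the replacement structure completely
 * before touching the old one, so a failed allocation leaves it intact.
 */
#include <stdlib.h>

struct tnode {
    size_t nchildren;
    struct tnode **child;
};

/* Return a doubled copy of 'old', or NULL (leaving 'old' untouched) if any
 * allocation fails; the caller then simply keeps using 'old'. */
static struct tnode *inflate_copy(const struct tnode *old)
{
    struct tnode *tn = calloc(1, sizeof(*tn));
    size_t i;

    if (!tn)
        return NULL;
    tn->nchildren = old->nchildren * 2;
    tn->child = calloc(tn->nchildren, sizeof(*tn->child));
    if (!tn->child)
        goto fail;

    /* Phase 1: preallocate every new child before moving anything over. */
    for (i = 0; i < tn->nchildren; i++) {
        tn->child[i] = calloc(1, sizeof(*tn->child[i]));
        if (!tn->child[i])
            goto fail;
    }
    /* Phase 2 (commit) would redistribute the old children; it cannot fail. */
    return tn;

fail:
    if (tn->child) {
        for (i = 0; i < tn->nchildren; i++)
            free(tn->child[i]);               /* free(NULL) is a no-op */
        free(tn->child);
    }
    free(tn);
    return NULL;
}

int main(void)
{
    struct tnode old = { .nchildren = 2 };
    old.child = calloc(old.nchildren, sizeof(*old.child));
    struct tnode *bigger = inflate_copy(&old);
    return bigger ? 0 : 1;    /* on failure, keep using 'old' unchanged */
}
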
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb759484979d..279f57abfecb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb)
970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently 970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
971 * discarded if to broadcast/multicast. 971 * discarded if to broadcast/multicast.
972 */ 972 */
973 if (icmph->type == ICMP_ECHO && 973 if ((icmph->type == ICMP_ECHO ||
974 icmph->type == ICMP_TIMESTAMP) &&
974 sysctl_icmp_echo_ignore_broadcasts) { 975 sysctl_icmp_echo_ignore_broadcasts) {
975 goto error; 976 goto error;
976 } 977 }
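
The icmp.c hunk widens the existing broadcast-ignore test from echo requests only to echo and timestamp requests, matching the RFC 1122 3.2.2.8 note in the surrounding comment. The check in isolation, as a small sketch (the type values match the standard ICMP assignments; the flag name is illustrative):

#include <stdbool.h>

#define ICMP_ECHO      8      /* echo request      */
#define ICMP_TIMESTAMP 13     /* timestamp request */

/* RFC 1122 allows echo and timestamp requests addressed to a broadcast or
 * multicast address to be silently discarded when the admin asks for it. */
static bool drop_broadcast_request(int icmp_type, bool to_bcast_or_mcast,
                                   bool ignore_broadcasts_sysctl)
{
    return to_bcast_or_mcast &&
           ignore_broadcasts_sysctl &&
           (icmp_type == ICMP_ECHO || icmp_type == ICMP_TIMESTAMP);
}
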
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..5088f90835ae 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{ 1615{
1616 int err; 1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr; 1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i; 1618 struct ip_mc_socklist *iml=NULL, *i;
1619 struct in_device *in_dev; 1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk); 1620 struct inet_sock *inet = inet_sk(sk);
1621 int ifindex;
1621 int count = 0; 1622 int count = 0;
1622 1623
1623 if (!MULTICAST(addr)) 1624 if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1633 goto done; 1634 goto done;
1634 } 1635 }
1635 1636
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE; 1637 err = -EADDRINUSE;
1638 ifindex = imr->imr_ifindex;
1639 for (i = inet->mc_list; i; i = i->next) { 1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { 1640 if (i->multi.imr_multiaddr.s_addr == addr &&
1641 /* New style additions are reference counted */ 1641 i->multi.imr_ifindex == ifindex)
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done; 1642 goto done;
1647 }
1648 count++; 1643 count++;
1649 } 1644 }
1650 err = -ENOBUFS; 1645 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships) 1646 if (count >= sysctl_igmp_max_memberships)
1647 goto done;
1648 iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
1649 if (iml == NULL)
1652 goto done; 1650 goto done;
1651
1653 memcpy(&iml->multi, imr, sizeof(*imr)); 1652 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list; 1653 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL; 1654 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE; 1655 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml; 1656 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr); 1657 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0; 1658 err = 0;
1662
1663done: 1659done:
1664 rtnl_shunlock(); 1660 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err; 1661 return err;
1668} 1662}
1669 1663
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{ 1687{
1694 struct inet_sock *inet = inet_sk(sk); 1688 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp; 1689 struct ip_mc_socklist *iml, **imlp;
1690 struct in_device *in_dev;
1691 u32 group = imr->imr_multiaddr.s_addr;
1692 u32 ifindex;
1696 1693
1697 rtnl_lock(); 1694 rtnl_lock();
1695 in_dev = ip_mc_find_dev(imr);
1696 if (!in_dev) {
1697 rtnl_unlock();
1698 return -ENODEV;
1699 }
1700 ifindex = imr->imr_ifindex;
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1701 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && 1702 if (iml->multi.imr_multiaddr.s_addr == group &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr && 1703 iml->multi.imr_ifindex == ifindex) {
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { 1704 (void) ip_mc_leave_src(sk, iml, in_dev);
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713 1705
1714 *imlp = iml->next; 1706 *imlp = iml->next;
1715 1707
1716 if (in_dev) { 1708 ip_mc_dec_group(in_dev, group);
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock(); 1709 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml)); 1710 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0; 1711 return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1736 struct in_device *in_dev = NULL; 1725 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk); 1726 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl; 1727 struct ip_sf_socklist *psl;
1728 int leavegroup = 0;
1739 int i, j, rv; 1729 int i, j, rv;
1740 1730
1741 if (!MULTICAST(addr)) 1731 if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1755 err = -EADDRNOTAVAIL; 1745 err = -EADDRNOTAVAIL;
1756 1746
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1747 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) 1748 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
1749 && pmc->multi.imr_ifindex == imr.imr_ifindex)
1759 break; 1750 break;
1760 } 1751 }
1761 if (!pmc) /* must have a prior join */ 1752 if (!pmc) { /* must have a prior join */
1753 err = -EINVAL;
1762 goto done; 1754 goto done;
1755 }
1763 /* if a source filter was set, must be the same mode as before */ 1756 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) { 1757 if (pmc->sflist) {
1765 if (pmc->sfmode != omode) 1758 if (pmc->sfmode != omode) {
1759 err = -EINVAL;
1766 goto done; 1760 goto done;
1761 }
1767 } else if (pmc->sfmode != omode) { 1762 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */ 1763 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); 1764 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1775 psl = pmc->sflist; 1770 psl = pmc->sflist;
1776 if (!add) { 1771 if (!add) {
1777 if (!psl) 1772 if (!psl)
1778 goto done; 1773 goto done; /* err = -EADDRNOTAVAIL */
1779 rv = !0; 1774 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) { 1775 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, 1776 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1784 break; 1779 break;
1785 } 1780 }
1786 if (rv) /* source not found */ 1781 if (rv) /* source not found */
1782 goto done; /* err = -EADDRNOTAVAIL */
1783
1784 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1785 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
1786 leavegroup = 1;
1787 goto done; 1787 goto done;
1788 }
1788 1789
1789 /* update the interface filter */ 1790 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 1791 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1842 &mreqs->imr_sourceaddr, 1); 1843 &mreqs->imr_sourceaddr, 1);
1843done: 1844done:
1844 rtnl_shunlock(); 1845 rtnl_shunlock();
1846 if (leavegroup)
1847 return ip_mc_leave_group(sk, &imr);
1845 return err; 1848 return err;
1846} 1849}
1847 1850
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 1851int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{ 1852{
1850 int err; 1853 int err = 0;
1851 struct ip_mreqn imr; 1854 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr; 1855 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc; 1856 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev; 1857 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk); 1858 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl; 1859 struct ip_sf_socklist *newpsl, *psl;
1860 int leavegroup = 0;
1857 1861
1858 if (!MULTICAST(addr)) 1862 if (!MULTICAST(addr))
1859 return -EINVAL; 1863 return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1872 err = -ENODEV; 1876 err = -ENODEV;
1873 goto done; 1877 goto done;
1874 } 1878 }
1875 err = -EADDRNOTAVAIL; 1879
1880 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1881 if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
1882 leavegroup = 1;
1883 goto done;
1884 }
1876 1885
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1886 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 1887 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex) 1888 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break; 1889 break;
1881 } 1890 }
1882 if (!pmc) /* must have a prior join */ 1891 if (!pmc) { /* must have a prior join */
1892 err = -EINVAL;
1883 goto done; 1893 goto done;
1894 }
1884 if (msf->imsf_numsrc) { 1895 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, 1896 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); 1897 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1909 0, NULL, 0); 1920 0, NULL, 0);
1910 pmc->sflist = newpsl; 1921 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode; 1922 pmc->sfmode = msf->imsf_fmode;
1923 err = 0;
1912done: 1924done:
1913 rtnl_shunlock(); 1925 rtnl_shunlock();
1926 if (leavegroup)
1927 err = ip_mc_leave_group(sk, &imr);
1914 return err; 1928 return err;
1915} 1929}
1916 1930
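
Several of the igmp.c hunks above add the same special case: a membership whose source filter becomes INCLUDE with an empty source list is mapped onto ip_mc_leave_group() instead of being kept around. A short sketch of why that mapping is natural under the IGMPv3 source-filter model (illustrative types, not the kernel ones):

#include <stdbool.h>
#include <stddef.h>

enum filter_mode { MODE_INCLUDE, MODE_EXCLUDE };

struct source_filter {
    enum filter_mode mode;
    size_t nsources;          /* addresses currently in the source list */
};

/* An INCLUDE filter with no sources accepts no senders at all, which is the
 * same observable state as not being a member; hence "(INCLUDE, empty) ==
 * LEAVE_GROUP" in the patch. */
static bool filter_means_leave(const struct source_filter *f)
{
    return f->mode == MODE_INCLUDE && f->nsources == 0;
}
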
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6ce5c3292f9f..80d13103b2b0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb); 110 netif_rx(newskb);
112 return 0; 111 return 0;
113} 112}
@@ -188,14 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 187 skb = skb2;
189 } 188 }
190 189
191#ifdef CONFIG_BRIDGE_NETFILTER
192 /* bridge-netfilter defers calling some IP hooks to the bridge layer
193 * and still needs the conntrack reference.
194 */
195 if (skb->nf_bridge == NULL)
196#endif
197 nf_reset(skb);
198
199 if (hh) { 190 if (hh) {
200 int hh_alen; 191 int hh_alen;
201 192
@@ -389,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
389 to->pkt_type = from->pkt_type; 380 to->pkt_type = from->pkt_type;
390 to->priority = from->priority; 381 to->priority = from->priority;
391 to->protocol = from->protocol; 382 to->protocol = from->protocol;
392 to->security = from->security;
393 dst_release(to->dst); 383 dst_release(to->dst);
394 to->dst = dst_clone(from->dst); 384 to->dst = dst_clone(from->dst);
395 to->dev = from->dev; 385 to->dev = from->dev;
@@ -1329,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1329 ip_rt_put(rt); 1319 ip_rt_put(rt);
1330} 1320}
1331 1321
1332/*
1333 * IP protocol layer initialiser
1334 */
1335
1336static struct packet_type ip_packet_type = {
1337 .type = __constant_htons(ETH_P_IP),
1338 .func = ip_rcv,
1339};
1340
1341/*
1342 * IP registers the packet type and then calls the subprotocol initialisers
1343 */
1344
1345void __init ip_init(void) 1322void __init ip_init(void)
1346{ 1323{
1347 dev_add_pack(&ip_packet_type);
1348
1349 ip_rt_init(); 1324 ip_rt_init();
1350 inet_initpeers(); 1325 inet_initpeers();
1351 1326
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f89811..fc7c481d0d79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
677 mreq.imr_address.s_addr = mreqs.imr_interface; 677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0; 678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq); 679 err = ip_mc_join_group(sk, &mreq);
680 if (err) 680 if (err && err != -EADDRINUSE)
681 break; 681 break;
682 omode = MCAST_INCLUDE; 682 omode = MCAST_INCLUDE;
683 add = 1; 683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 684 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE; 685 omode = MCAST_INCLUDE;
686 add = 0; 686 add = 0;
687 } 687 }
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
754 mreq.imr_address.s_addr = 0; 754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface; 755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq); 756 err = ip_mc_join_group(sk, &mreq);
757 if (err) 757 if (err && err != -EADDRINUSE)
758 break; 758 break;
759 greqs.gsr_interface = mreq.imr_ifindex; 759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE; 760 omode = MCAST_INCLUDE;
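
The ip_sockglue.c hunks stop treating -EADDRINUSE from ip_mc_join_group() as fatal in the source-membership paths: joining a group the socket already belongs to is expected there, and the source-filter update should still go ahead. The control flow in miniature, using kernel-style negative errno values and an illustrative helper name:

#include <errno.h>

static int add_source_membership(int join_result)
{
    if (join_result && join_result != -EADDRINUSE)
        return join_result;   /* a real failure: stop here                  */
    return 0;                 /* new or pre-existing membership: go on and
                               * install the per-source filter              */
}
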
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64bb..c9820bfc493a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
2# IP Virtual Server configuration 2# IP Virtual Server configuration
3# 3#
4menu "IP: Virtual Server Configuration" 4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER 5 depends on NETFILTER
6 6
7config IP_VS 7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)" 8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER 9 depends on NETFILTER
10 ---help--- 10 ---help---
11 IP Virtual Server support will let you build a high-performance 11 IP Virtual Server support will let you build a high-performance
12 virtual server based on cluster of two or more real servers. This 12 virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index 9f16ab309106..d0145a8b1551 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -758,7 +758,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
758 return 1; 758 return 1;
759} 759}
760 760
761 761/* Called from keventd and must protect itself from softirqs */
762void ip_vs_random_dropentry(void) 762void ip_vs_random_dropentry(void)
763{ 763{
764 int idx; 764 int idx;
@@ -773,7 +773,7 @@ void ip_vs_random_dropentry(void)
773 /* 773 /*
774 * Lock is actually needed in this loop. 774 * Lock is actually needed in this loop.
775 */ 775 */
776 ct_write_lock(hash); 776 ct_write_lock_bh(hash);
777 777
778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -806,7 +806,7 @@ void ip_vs_random_dropentry(void)
806 ip_vs_conn_expire_now(cp->control); 806 ip_vs_conn_expire_now(cp->control);
807 } 807 }
808 } 808 }
809 ct_write_unlock(hash); 809 ct_write_unlock_bh(hash);
810 } 810 }
811} 811}
812 812
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 12a82e91d22a..7d99ede2ef79 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
90#endif 90#endif
91 91
92/* 92/*
93 * update_defense_level is called from keventd and from sysctl. 93 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
94 */ 95 */
95static void update_defense_level(void) 96static void update_defense_level(void)
96{ 97{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
110 111
111 nomem = (availmem < sysctl_ip_vs_amemthresh); 112 nomem = (availmem < sysctl_ip_vs_amemthresh);
112 113
114 local_bh_disable();
115
113 /* drop_entry */ 116 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock); 117 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) { 118 switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
206 if (to_change >= 0) 209 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock); 211 write_unlock(&__ip_vs_securetcp_lock);
212
213 local_bh_enable();
209} 214}
210 215
211 216
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 /* Restore the correct value */ 1365 /* Restore the correct value */
1361 *valp = val; 1366 *valp = val;
1362 } else { 1367 } else {
1363 local_bh_disable();
1364 update_defense_level(); 1368 update_defense_level();
1365 local_bh_enable();
1366 } 1369 }
1367 } 1370 }
1368 return rc; 1371 return rc;
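
The ip_vs_ctl.c and ip_vs_conn.c hunks share one idea: update_defense_level() and ip_vs_random_dropentry() are reached from process context (sysctl and keventd) but take locks that softirqs also take, so the protection (local_bh_disable()/_bh lock variants) now lives inside those functions rather than at a single call site. A runnable userspace sketch of that shape, with a mutex standing in for the kernel's bottom-half protection and all names illustrative:

/*
 * Sketch (not from the patch): a helper reachable from several contexts
 * does its own locking instead of trusting each caller to remember it.
 */
#include <pthread.h>

static pthread_mutex_t defense_lock = PTHREAD_MUTEX_INITIALIZER;
static int drop_entry_state;               /* state also read by the fast path */

static void update_defense_level_sketch(void)
{
    pthread_mutex_lock(&defense_lock);     /* protection lives here ...        */
    drop_entry_state ^= 1;                 /* recompute drop_entry/secure_tcp  */
    pthread_mutex_unlock(&defense_lock);   /* ... not at each call site        */
}

static void sysctl_handler(void) { update_defense_level_sketch(); }
static void keventd_worker(void) { update_defense_level_sketch(); }

int main(void)
{
    sysctl_handler();
    keventd_worker();
    return 0;
}
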
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc95102873..1dd824f3cf0a 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
432 const struct net_device *out, 432 const struct net_device *out,
433 int (*okfn)(struct sk_buff *)) 433 int (*okfn)(struct sk_buff *))
434{ 434{
435#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
436 /* Previously seen (loopback)? Ignore. Do this before
437 fragment check. */
438 if ((*pskb)->nfct)
439 return NF_ACCEPT;
440#endif
441
435 /* Gather fragments. */ 442 /* Gather fragments. */
436 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 443 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
437 *pskb = ip_ct_gather_frags(*pskb, 444 *pskb = ip_ct_gather_frags(*pskb,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 12a1cf306f67..d675ff80b04d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n", 1686 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) { 1688 if (dev->hard_header_len && skb->mac.raw) {
1651 int i; 1689 int i;
1652 unsigned char *p = skb->mac.raw; 1690 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: "); 1691 printk(KERN_WARNING "ll header: ");
@@ -3073,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3073 3111
3074int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3075{ 3113{
3076 int i, order, goal, rc = 0; 3114 int rc = 0;
3077 3115
3078 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3079 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3080 3118
3081#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3082 for (order = 0; 3122 for (order = 0;
3083 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3084 /* NOTHING */; 3124 /* NOTHING */;
@@ -3086,6 +3126,7 @@ int __init ip_rt_init(void)
3086 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3087 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3088 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3089#endif 3130#endif
3090 3131
3091 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3096,36 +3137,19 @@ int __init ip_rt_init(void)
3096 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3097 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3098 3139
3099 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3100 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3101 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3102 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3103 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3104 3145 (27 - PAGE_SHIFT) :
3105 do { 3146 (29 - PAGE_SHIFT),
3106 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3107 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3108 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3109 rt_hash_mask--; 3150 0);
3110 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3111 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3112 } while (rt_hash_table == NULL && --order > 0);
3113
3114 if (!rt_hash_table)
3115 panic("Failed to allocate IP route cache hash table\n");
3116
3117 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3118 rt_hash_mask,
3119 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3120
3121 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3122 /* NOTHING */;
3123
3124 rt_hash_mask--;
3125 for (i = 0; i <= rt_hash_mask; i++) {
3126 spin_lock_init(&rt_hash_table[i].lock);
3127 rt_hash_table[i].chain = NULL;
3128 }
3129 3153
3130 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3131 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
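
The route.c hunk replaces the per-bucket spinlock with a small power-of-two table of locks indexed by bucket number, which shrinks every rt_hash_bucket while still serializing access per chain. A minimal userspace sketch of that lock-striping idea using pthreads; all names are illustrative:

/*
 * Sketch (not from the patch): share NLOCKS locks across NBUCKETS buckets,
 * mapping bucket -> lock with "slot & (NLOCKS - 1)" as rt_hash_lock_addr() does.
 */
#include <pthread.h>

#define NBUCKETS (1 << 16)
#define NLOCKS   256                      /* power of two, NLOCKS <= NBUCKETS */

struct bucket { void *chain; };           /* the lock no longer lives here */

static struct bucket table[NBUCKETS];
static pthread_mutex_t locks[NLOCKS];

static pthread_mutex_t *lock_addr(unsigned int slot)
{
    return &locks[slot & (NLOCKS - 1)];
}

static void table_init(void)
{
    for (int i = 0; i < NLOCKS; i++)
        pthread_mutex_init(&locks[i], NULL);
}

static void bucket_update(unsigned int slot, void *entry)
{
    pthread_mutex_lock(lock_addr(slot));
    table[slot].chain = entry;            /* any work on this bucket's chain */
    pthread_mutex_unlock(lock_addr(slot));
}

int main(void)
{
    table_init();
    bucket_update(12345, (void *)0);
    return 0;
}

The kernel version additionally scales RT_HASH_LOCK_SZ with NR_CPUS and compiles the locks away entirely on non-SMP, non-debug builds, since the striping only has to cover real concurrency, not the number of buckets.
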
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 882436da9a3a..ddb6ce4ecff2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
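
select_size() now returns 0 when the route supports both SG and TSO, so tcp_sendmsg() allocates skbs with no linear payload and all data lands in page fragments, which lines up with the BUG_ON(skb->len != skb->data_len) assumption in tso_fragment() later in this patch. A rough user-space stand-in for the decision, with invented constants replacing SKB_MAX_HEAD()/MAX_SKB_FRAGS:

#include <stdio.h>

#define PAGE_SZ		4096
#define MAX_FRAGS	16
#define HEADROOM_MAX	1664	/* pretend SKB_MAX_HEAD(MAX_TCP_HEADER) */

/* Rough model of the select_size() change above; numbers are made up. */
static int select_linear_size(int mss, int can_sg, int can_tso)
{
	int tmp = mss;

	if (can_sg) {
		if (can_tso)
			return 0;	/* keep all payload in page frags */
		if (tmp >= HEADROOM_MAX &&
		    tmp <= HEADROOM_MAX + (MAX_FRAGS - 1) * PAGE_SZ)
			tmp = HEADROOM_MAX;
	}
	return tmp;
}

int main(void)
{
	printf("linear bytes, SG+TSO NIC:  %d\n", select_linear_size(1448, 1, 1));
	printf("linear bytes, SG-only NIC: %d\n", select_linear_size(8960, 1, 0));
	return 0;
}
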
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
1101 struct sk_buff *skb; 1105 struct sk_buff *skb;
1102 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_sock *tp = tcp_sk(sk);
1103 1107
1104 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); 1108 NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1105 1109
1106 /* RX process wants to run with disabled BHs, though it is not 1110 /* RX process wants to run with disabled BHs, though it is not
1107 * necessary */ 1111 * necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1365 * is not empty. It is more elegant, but eats cycles, 1369 * is not empty. It is more elegant, but eats cycles,
1366 * unfortunately. 1370 * unfortunately.
1367 */ 1371 */
1368 if (skb_queue_len(&tp->ucopy.prequeue)) 1372 if (!skb_queue_empty(&tp->ucopy.prequeue))
1369 goto do_prequeue; 1373 goto do_prequeue;
1370 1374
1371 /* __ Set realtime policy in scheduler __ */ 1375 /* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1390 } 1394 }
1391 1395
1392 if (tp->rcv_nxt == tp->copied_seq && 1396 if (tp->rcv_nxt == tp->copied_seq &&
1393 skb_queue_len(&tp->ucopy.prequeue)) { 1397 !skb_queue_empty(&tp->ucopy.prequeue)) {
1394do_prequeue: 1398do_prequeue:
1395 tcp_prequeue_process(sk); 1399 tcp_prequeue_process(sk);
1396 1400
@@ -1472,7 +1476,7 @@ skip_copy:
1472 } while (len > 0); 1476 } while (len > 0);
1473 1477
1474 if (user_recv) { 1478 if (user_recv) {
1475 if (skb_queue_len(&tp->ucopy.prequeue)) { 1479 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1476 int chunk; 1480 int chunk;
1477 1481
1478 tp->ucopy.len = copied > 0 ? len : 0; 1482 tp->ucopy.len = copied > 0 ? len : 0;
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2128 2132
2129 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2130 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2131 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2132 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2133 2137
2134 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2178 2182
2179 switch (optname) { 2183 switch (optname) {
2180 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2181 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2182 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2183 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2184 break; 2188 break;
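
With mss_cache_std folded into mss_cache, both tcp_info and the TCP_MAXSEG branch above now report the same field. A small user-space probe of that value; on a fresh, unconnected socket this typically prints the 536-byte default set up in tcp_v4_init_sock():

#include <stdio.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int mss = 0;
	socklen_t len = sizeof(mss);

	if (fd < 0 || getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len) < 0) {
		perror("TCP_MAXSEG");
		return 1;
	}
	printf("TCP_MAXSEG: %d\n", mss);
	close(fd);
	return 0;
}
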
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7bbbbc33eb4b..53a8a5399f1e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
741 741
742 if (!cwnd) { 742 if (!cwnd) {
743 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
744 cwnd = 2; 744 cwnd = 2;
745 else 745 else
746 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
747 } 747 }
748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
749} 749}
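
tcp_init_cwnd() keeps its MSS-based defaults, now reading the unified mss_cache. A worked example of the rule for the case where no route metric overrides it:

#include <stdio.h>

/* The MSS-based initial congestion window rule from the hunk above,
 * with the route metric left out for the sake of the example.
 */
static unsigned int init_cwnd(unsigned int mss, unsigned int clamp)
{
	unsigned int cwnd;

	if (mss > 1460)
		cwnd = 2;
	else
		cwnd = (mss > 1095) ? 3 : 4;
	return cwnd < clamp ? cwnd : clamp;
}

int main(void)
{
	printf("mss  536 -> cwnd %u\n", init_cwnd(536,  ~0U));	/* 4 */
	printf("mss 1448 -> cwnd %u\n", init_cwnd(1448, ~0U));	/* 3 */
	printf("mss 8960 -> cwnd %u\n", init_cwnd(8960, ~0U));	/* 2 */
	return 0;
}
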
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
914 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
915 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
916 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
917 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
918 } 918 }
919 919
920 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1077 (IsFack(tp) || 1077 (IsFack(tp) ||
1078 !before(lost_retrans, 1078 !before(lost_retrans,
1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1080 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1082 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1083 1083
@@ -1957,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
1957 } 1957 }
1958} 1958}
1959 1959
1960/* There is one downside to this scheme. Although we keep the
1961 * ACK clock ticking, adjusting packet counters and advancing
1962 * congestion window, we do not liberate socket send buffer
1963 * space.
1964 *
1965 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
1966 * then making a write space wakeup callback is a possible
1967 * future enhancement. WARNING: it is not trivial to make.
1968 */
1969static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
1970 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
1971{ 1962{
@@ -2047,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt
2047 * the other end. 2038 * the other end.
2048 */ 2039 */
2049 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2050 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2051 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2052 now, &seq_rtt); 2044 now, &seq_rtt);
2053 break; 2045 break;
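
The extra after(tp->snd_una, scb->seq) guard above makes sure the TSO partial-ACK path only runs when the cumulative ACK actually reaches into the frame. A plain-integer restatement of the combined test, with a wrap-tolerant comparison written in the spirit of the kernel's after():

#include <stdio.h>

/* Wrap-tolerant "a is after b" on 32-bit sequence numbers. */
static int seq_after(unsigned int a, unsigned int b)
{
	return (int)(a - b) > 0;
}

/* A multi-segment (TSO) skb is only handed to the partial-ACK path when
 * the ACK covers part of it but not all of it.
 */
static int partially_acked(unsigned int seq, unsigned int end_seq,
			   unsigned int snd_una, int pcount)
{
	return pcount > 1 &&
	       seq_after(end_seq, snd_una) &&	/* not fully acked ... */
	       seq_after(snd_una, seq);		/* ... but partly acked */
}

int main(void)
{
	/* 3 x 1448-byte segments queued at seq 1000 */
	printf("%d\n", partially_acked(1000, 1000 + 3 * 1448, 3896, 3)); /* 1 */
	printf("%d\n", partially_acked(1000, 1000 + 3 * 1448, 1000, 3)); /* 0 */
	return 0;
}
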
@@ -2810,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
2810 int this_sack; 2802 int this_sack;
2811 2803
2812 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 2804 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
2813 if (skb_queue_len(&tp->out_of_order_queue) == 0) { 2805 if (skb_queue_empty(&tp->out_of_order_queue)) {
2814 tp->rx_opt.num_sacks = 0; 2806 tp->rx_opt.num_sacks = 0;
2815 tp->rx_opt.eff_sacks = tp->rx_opt.dsack; 2807 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
2816 return; 2808 return;
@@ -2943,13 +2935,13 @@ queue_and_out:
2943 if(th->fin) 2935 if(th->fin)
2944 tcp_fin(skb, sk, th); 2936 tcp_fin(skb, sk, th);
2945 2937
2946 if (skb_queue_len(&tp->out_of_order_queue)) { 2938 if (!skb_queue_empty(&tp->out_of_order_queue)) {
2947 tcp_ofo_queue(sk); 2939 tcp_ofo_queue(sk);
2948 2940
2949 /* RFC2581. 4.2. SHOULD send immediate ACK, when 2941 /* RFC2581. 4.2. SHOULD send immediate ACK, when
2950 * gap in queue is filled. 2942 * gap in queue is filled.
2951 */ 2943 */
2952 if (!skb_queue_len(&tp->out_of_order_queue)) 2944 if (skb_queue_empty(&tp->out_of_order_queue))
2953 tp->ack.pingpong = 0; 2945 tp->ack.pingpong = 0;
2954 } 2946 }
2955 2947
@@ -3257,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
3257 * This must not ever occur. */ 3249 * This must not ever occur. */
3258 3250
3259 /* First, purge the out_of_order queue. */ 3251 /* First, purge the out_of_order queue. */
3260 if (skb_queue_len(&tp->out_of_order_queue)) { 3252 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3261 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 3253 NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
3262 skb_queue_len(&tp->out_of_order_queue));
3263 __skb_queue_purge(&tp->out_of_order_queue); 3254 __skb_queue_purge(&tp->out_of_order_queue);
3264 3255
3265 /* Reset SACK state. A conforming SACK implementation will 3256 /* Reset SACK state. A conforming SACK implementation will
@@ -3308,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3308 tp->snd_cwnd_stamp = tcp_time_stamp; 3299 tp->snd_cwnd_stamp = tcp_time_stamp;
3309} 3300}
3310 3301
3302static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3303{
3304 /* If the user specified a specific send buffer setting, do
3305 * not modify it.
3306 */
3307 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3308 return 0;
3309
3310 /* If we are under global TCP memory pressure, do not expand. */
3311 if (tcp_memory_pressure)
3312 return 0;
3313
3314 /* If we are under soft global TCP memory pressure, do not expand. */
3315 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3316 return 0;
3317
3318 /* If we filled the congestion window, do not expand. */
3319 if (tp->packets_out >= tp->snd_cwnd)
3320 return 0;
3321
3322 return 1;
3323}
3311 3324
3312/* When incoming ACK allowed to free some skb from write_queue, 3325/* When incoming ACK allowed to free some skb from write_queue,
3313 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3326 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3319,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
3319{ 3332{
3320 struct tcp_sock *tp = tcp_sk(sk); 3333 struct tcp_sock *tp = tcp_sk(sk);
3321 3334
3322 if (tp->packets_out < tp->snd_cwnd && 3335 if (tcp_should_expand_sndbuf(sk, tp)) {
3323 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3336 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3324 !tcp_memory_pressure &&
3325 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3326 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3327 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3337 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3328 demanded = max_t(unsigned int, tp->snd_cwnd, 3338 demanded = max_t(unsigned int, tp->snd_cwnd,
3329 tp->reordering + 1); 3339 tp->reordering + 1);
@@ -3346,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
3346 } 3356 }
3347} 3357}
3348 3358
3349static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3359static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3350{
3351 struct tcp_sock *tp = tcp_sk(sk);
3352
3353 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3354 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3355 tcp_write_xmit(sk, tp->nonagle))
3356 tcp_check_probe_timer(sk, tp);
3357}
3358
3359static __inline__ void tcp_data_snd_check(struct sock *sk)
3360{ 3360{
3361 struct sk_buff *skb = sk->sk_send_head; 3361 tcp_push_pending_frames(sk, tp);
3362
3363 if (skb != NULL)
3364 __tcp_data_snd_check(sk, skb);
3365 tcp_check_space(sk); 3362 tcp_check_space(sk);
3366} 3363}
3367 3364
@@ -3655,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3655 */ 3652 */
3656 tcp_ack(sk, skb, 0); 3653 tcp_ack(sk, skb, 0);
3657 __kfree_skb(skb); 3654 __kfree_skb(skb);
3658 tcp_data_snd_check(sk); 3655 tcp_data_snd_check(sk, tp);
3659 return 0; 3656 return 0;
3660 } else { /* Header too small */ 3657 } else { /* Header too small */
3661 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3658 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -3721,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3721 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3718 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
3722 /* Well, only one small jumplet in fast path... */ 3719 /* Well, only one small jumplet in fast path... */
3723 tcp_ack(sk, skb, FLAG_DATA); 3720 tcp_ack(sk, skb, FLAG_DATA);
3724 tcp_data_snd_check(sk); 3721 tcp_data_snd_check(sk, tp);
3725 if (!tcp_ack_scheduled(tp)) 3722 if (!tcp_ack_scheduled(tp))
3726 goto no_ack; 3723 goto no_ack;
3727 } 3724 }
@@ -3799,7 +3796,7 @@ step5:
3799 /* step 7: process the segment text */ 3796 /* step 7: process the segment text */
3800 tcp_data_queue(sk, skb); 3797 tcp_data_queue(sk, skb);
3801 3798
3802 tcp_data_snd_check(sk); 3799 tcp_data_snd_check(sk, tp);
3803 tcp_ack_snd_check(sk); 3800 tcp_ack_snd_check(sk);
3804 return 0; 3801 return 0;
3805 3802
@@ -4109,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4109 /* Do step6 onward by hand. */ 4106 /* Do step6 onward by hand. */
4110 tcp_urg(sk, skb, th); 4107 tcp_urg(sk, skb, th);
4111 __kfree_skb(skb); 4108 __kfree_skb(skb);
4112 tcp_data_snd_check(sk); 4109 tcp_data_snd_check(sk, tp);
4113 return 0; 4110 return 0;
4114 } 4111 }
4115 4112
@@ -4300,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4300 4297
4301 /* tcp_data could move socket to TIME-WAIT */ 4298 /* tcp_data could move socket to TIME-WAIT */
4302 if (sk->sk_state != TCP_CLOSE) { 4299 if (sk->sk_state != TCP_CLOSE) {
4303 tcp_data_snd_check(sk); 4300 tcp_data_snd_check(sk, tp);
4304 tcp_ack_snd_check(sk); 4301 tcp_ack_snd_check(sk);
4305 } 4302 }
4306 4303
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops; 2051 tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0e17c244875c..e3f8ea1bfa9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
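
Lowering tcp_tso_win_divisor from 8 to 3 means a single deferred TSO frame may consume up to roughly a third of the current window instead of an eighth; the divisor is consumed by tcp_tso_should_defer() later in this patch. Back-of-the-envelope numbers, purely illustrative:

#include <stdio.h>

/* Arithmetic behind the sysctl above: one TSO frame is allowed to take
 * about 1/divisor of the window before deferral stops paying off.
 */
int main(void)
{
	unsigned int cwnd = 40, mss = 1448;
	unsigned int window = cwnd * mss;		/* 57920 bytes */
	unsigned int divisors[] = { 8, 3 };

	for (int i = 0; i < 2; i++)
		printf("divisor %u: up to %u bytes (~%u segments) per frame\n",
		       divisors[i], window / divisors[i],
		       (window / divisors[i]) / mss);
	return 0;
}
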
@@ -140,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
140 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
141} 141}
142 142
143static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
144{ 144{
145 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
146 146
147 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
149} 149}
150 150
@@ -355,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
355 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
356 356
357 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
358 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
359 359
360 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
361 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
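
tcp_event_ack_sent() now charges the quick-ACK budget per segment rather than per skb, since one queued TSO frame leaves the NIC as several ACK-bearing segments. A toy stand-in for that bookkeeping; the helper is hypothetical and only written in the spirit of tcp_dec_quickack_mode():

#include <stdio.h>

/* Hypothetical sketch: spend "pkts" quick-ACK credits, never below zero. */
static unsigned int charge_quickack(unsigned int quick, unsigned int pkts)
{
	return pkts >= quick ? 0 : quick - pkts;
}

int main(void)
{
	unsigned int quick = 8;

	quick = charge_quickack(quick, 1);	/* plain segment: 7 left */
	quick = charge_quickack(quick, 4);	/* 4-segment TSO frame: 3 left */
	printf("quick-ACK credits left: %u\n", quick);
	return 0;
}
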
@@ -403,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
403 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
404} 404}
405 405
406static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
407{
408 /* Force push to be on for any TSO frames to workaround
409 * problems with busted implementations like Mac OS-X that
410 * hold off socket receive wakeups until push is seen.
411 */
412 if (tcp_skb_pcount(skb) > 1)
413 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
414}
415
416/* Send _single_ skb sitting at the send head. This function requires
417 * true push pending frames to setup probe timer etc.
418 */
419void tcp_push_one(struct sock *sk, unsigned cur_mss)
420{ 407{
421 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
422 struct sk_buff *skb = sk->sk_send_head;
423 409
424 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { 410 if (skb->len <= tp->mss_cache ||
425 /* Send it out now. */
426 TCP_SKB_CB(skb)->when = tcp_time_stamp;
427 tcp_tso_set_push(skb);
428 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
429 sk->sk_send_head = NULL;
430 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
431 tcp_packets_out_inc(sk, tp, skb);
432 return;
433 }
434 }
435}
436
437void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
438{
439 struct tcp_sock *tp = tcp_sk(sk);
440
441 if (skb->len <= tp->mss_cache_std ||
442 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
443 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
444 * non-TSO case. 413 * non-TSO case.
@@ -448,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
448 } else { 417 } else {
449 unsigned int factor; 418 unsigned int factor;
450 419
451 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
452 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
453 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
454 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
455 } 424 }
456} 425}
457 426
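
The pcount computed above is a plain ceiling division of the skb length by the MSS, with tso_size pinned to mss_cache. For example:

#include <stdio.h>

/* How many on-the-wire segments a queued skb will turn into. */
static unsigned int tso_factor(unsigned int len, unsigned int mss)
{
	return (len + mss - 1) / mss;
}

int main(void)
{
	printf("%u\n", tso_factor(1448, 1448));		/* 1  */
	printf("%u\n", tso_factor(7300, 1448));		/* 6  */
	printf("%u\n", tso_factor(65160, 1448));	/* 45 */
	return 0;
}
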
@@ -537,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
537 } 506 }
538 507
539 /* Link BUFF into the send queue. */ 508 /* Link BUFF into the send queue. */
509 skb_header_release(buff);
540 __skb_append(skb, buff); 510 __skb_append(skb, buff);
541 511
542 return 0; 512 return 0;
@@ -657,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
657 627
658 /* And store cached results */ 628 /* And store cached results */
659 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
660 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
661 631
662 return mss_now; 632 return mss_now;
663} 633}
@@ -669,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
669 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
670 * is not a big flaw. 640 * is not a big flaw.
671 */ 641 */
672 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
673unsigned int tcp_current_mss(struct sock *sk, int large)
674{ 643{
675 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
676 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
677 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
678 656
679 mss_now = tp->mss_cache_std;
680 if (dst) { 657 if (dst) {
681 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
682 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
683 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
684 } 661 }
685 662
686 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
687 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
688 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
689 666
690 if (do_large) { 667 xmit_size_goal = mss_now;
691 unsigned int large_mss, factor, limit;
692 668
693 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
694 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
695 673
696 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
697 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
698 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
677 68U - tp->tcp_header_len);
678
679 xmit_size_goal -= (xmit_size_goal % mss_now);
680 }
681 tp->xmit_size_goal = xmit_size_goal;
699 682
700 factor = large_mss / mss_now; 683 return mss_now;
684}
701 685
702 /* Always keep large mss multiple of real mss, but 686/* Congestion window validation. (RFC2861) */
703 * do not exceed 1/tso_win_divisor of the congestion window
704 * so we can keep the ACK clock ticking and minimize
705 * bursting.
706 */
707 limit = tp->snd_cwnd;
708 if (sysctl_tcp_tso_win_divisor)
709 limit /= sysctl_tcp_tso_win_divisor;
710 limit = max(1U, limit);
711 if (factor > limit)
712 factor = limit;
713 687
714 tp->mss_cache = mss_now * factor; 688static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
689{
690 __u32 packets_out = tp->packets_out;
691
692 if (packets_out >= tp->snd_cwnd) {
693 /* Network is fed fully. */
694 tp->snd_cwnd_used = 0;
695 tp->snd_cwnd_stamp = tcp_time_stamp;
696 } else {
697 /* Network starves. */
698 if (tp->packets_out > tp->snd_cwnd_used)
699 tp->snd_cwnd_used = tp->packets_out;
715 700
716 mss_now = tp->mss_cache; 701 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
702 tcp_cwnd_application_limited(sk);
717 } 703 }
704}
718 705
719 if (tp->rx_opt.eff_sacks) 706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
720 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 707{
721 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 708 u32 window, cwnd_len;
722 return mss_now; 709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0 if the packet can be sent now without violating Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* The Nagle rule does not apply to frames which sit in the middle of the
780 * write_queue (they have no chance to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 if (tp->ca_state != TCP_CA_Open)
913 return 0;
914
915 in_flight = tcp_packets_in_flight(tp);
916
917 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
918 (tp->snd_cwnd <= in_flight));
919
920 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
921
922 /* From in_flight test above, we know that cwnd > in_flight. */
923 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
924
925 limit = min(send_win, cong_win);
926
927 /* If sk_send_head can be sent fully now, just do it. */
928 if (skb->len <= limit)
929 return 0;
930
931 if (sysctl_tcp_tso_win_divisor) {
932 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
933
934 /* If at least some fraction of a window is available,
935 * just use it.
936 */
937 chunk /= sysctl_tcp_tso_win_divisor;
938 if (limit >= chunk)
939 return 0;
940 } else {
941 /* Different approach, try not to defer past a single
942 * ACK. Receiver should ACK every other full sized
943 * frame, so if we have space for more than 3 frames
944 * then send now.
945 */
946 if (limit > tcp_max_burst(tp) * tp->mss_cache)
947 return 0;
948 }
949
950 /* Ok, it looks like it is advisable to defer. */
951 return 1;
723} 952}
724 953
725/* This routine writes packets to the network. It advances the 954/* This routine writes packets to the network. It advances the
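
tcp_tso_should_defer() is the heart of the new scheme: hold back a TSO frame that does not fit entirely until either it does, or at least a reasonable fraction of the window (1/tso_win_divisor, or about three MSS when the sysctl is zero) has opened up. A simplified user-space restatement with the CA-state and FIN short-circuits omitted and all state passed in as plain integers:

#include <stdio.h>

static int tso_should_defer(unsigned int skb_len,
			    unsigned int send_win, unsigned int cong_win,
			    unsigned int wnd_for_chunk, unsigned int divisor,
			    unsigned int max_burst_bytes)
{
	unsigned int limit = send_win < cong_win ? send_win : cong_win;

	if (skb_len <= limit)		/* fits entirely: just send it */
		return 0;

	if (divisor) {
		unsigned int chunk = wnd_for_chunk / divisor;

		if (limit >= chunk)	/* a decent fraction is open */
			return 0;
	} else if (limit > max_burst_bytes) {
		return 0;
	}

	return 1;			/* wait for more window / ACKs */
}

int main(void)
{
	unsigned int mss = 1448, cwnd = 10, snd_wnd = 64000;
	unsigned int cong_win = (cwnd - 8) * mss;	/* 8 segs in flight */
	unsigned int chunk_base = snd_wnd < cwnd * mss ? snd_wnd : cwnd * mss;

	/* 60000-byte frame, only 2 segments of cwnd free: defer (1). */
	printf("%d\n", tso_should_defer(60000, snd_wnd, cong_win,
					chunk_base, 3, 3 * mss));
	/* Same frame once 8 segments are free: send now (0). */
	cong_win = (cwnd - 2) * mss;
	printf("%d\n", tso_should_defer(60000, snd_wnd, cong_win,
					chunk_base, 3, 3 * mss));
	return 0;
}
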
@@ -729,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
729 * Returns 1, if no segments are in flight and we have queued segments, but 958 * Returns 1, if no segments are in flight and we have queued segments, but
730 * cannot send anything now because of SWS or another problem. 959 * cannot send anything now because of SWS or another problem.
731 */ 960 */
732int tcp_write_xmit(struct sock *sk, int nonagle) 961static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
733{ 962{
734 struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_sock *tp = tcp_sk(sk);
735 unsigned int mss_now; 964 struct sk_buff *skb;
965 unsigned int tso_segs, sent_pkts;
966 int cwnd_quota;
736 967
737 /* If we are closed, the bytes will have to remain here. 968 /* If we are closed, the bytes will have to remain here.
738 * In time closedown will finish, we empty the write queue and all 969 * In time closedown will finish, we empty the write queue and all
739 * will be happy. 970 * will be happy.
740 */ 971 */
741 if (sk->sk_state != TCP_CLOSE) { 972 if (unlikely(sk->sk_state == TCP_CLOSE))
742 struct sk_buff *skb; 973 return 0;
743 int sent_pkts = 0; 974
975 skb = sk->sk_send_head;
976 if (unlikely(!skb))
977 return 0;
978
979 tso_segs = tcp_init_tso_segs(sk, skb);
980 cwnd_quota = tcp_cwnd_test(tp, skb);
981 if (unlikely(!cwnd_quota))
982 goto out;
983
984 sent_pkts = 0;
985 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
986 BUG_ON(!tso_segs);
987
988 if (tso_segs == 1) {
989 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
990 (tcp_skb_is_last(sk, skb) ?
991 nonagle : TCP_NAGLE_PUSH))))
992 break;
993 } else {
994 if (tcp_tso_should_defer(sk, tp, skb))
995 break;
996 }
744 997
745 /* Account for SACKS, we may need to fragment due to this. 998 if (tso_segs > 1) {
746 * It is just like the real MSS changing on us midstream. 999 u32 limit = tcp_window_allows(tp, skb,
747 * We also handle things correctly when the user adds some 1000 mss_now, cwnd_quota);
748 * IP options mid-stream. Silly to do, but cover it. 1001
749 */ 1002 if (skb->len < limit) {
750 mss_now = tcp_current_mss(sk, 1); 1003 unsigned int trim = skb->len % mss_now;
751 1004
752 while ((skb = sk->sk_send_head) && 1005 if (trim)
753 tcp_snd_test(sk, skb, mss_now, 1006 limit = skb->len - trim;
754 tcp_skb_is_last(sk, skb) ? nonagle : 1007 }
755 TCP_NAGLE_PUSH)) { 1008 if (skb->len > limit) {
756 if (skb->len > mss_now) { 1009 if (tso_fragment(sk, skb, limit))
757 if (tcp_fragment(sk, skb, mss_now))
758 break; 1010 break;
759 } 1011 }
760 1012 } else if (unlikely(skb->len > mss_now)) {
761 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1013 if (unlikely(tcp_fragment(sk, skb, mss_now)))
762 tcp_tso_set_push(skb);
763 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
764 break; 1014 break;
1015 }
765 1016
766 /* Advance the send_head. This one is sent out. 1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
767 * This call will increment packets_out. 1018
768 */ 1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
769 update_send_head(sk, tp, skb); 1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029
1030 /* Do not optimize this to use tso_segs. If we chopped up
1031 * the packet above, tso_segs will no longer be valid.
1032 */
1033 cwnd_quota -= tcp_skb_pcount(skb);
1034
1035 BUG_ON(cwnd_quota < 0);
1036 if (!cwnd_quota)
1037 break;
1038
1039 skb = sk->sk_send_head;
1040 if (!skb)
1041 break;
1042 tso_segs = tcp_init_tso_segs(sk, skb);
1043 }
1044
1045 if (likely(sent_pkts)) {
1046 tcp_cwnd_validate(sk, tp);
1047 return 0;
1048 }
1049out:
1050 return !tp->packets_out && sk->sk_send_head;
1051}
1052
1053/* Push out any pending frames which were held back due to
1054 * TCP_CORK or attempt at coalescing tiny packets.
1055 * The socket must be locked by the caller.
1056 */
1057void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1058 unsigned int cur_mss, int nonagle)
1059{
1060 struct sk_buff *skb = sk->sk_send_head;
770 1061
771 tcp_minshall_update(tp, mss_now, skb); 1062 if (skb) {
772 sent_pkts = 1; 1063 if (tcp_write_xmit(sk, cur_mss, nonagle))
1064 tcp_check_probe_timer(sk, tp);
1065 }
1066}
1067
1068/* Send _single_ skb sitting at the send head. This function requires
1069 * true push pending frames to setup probe timer etc.
1070 */
1071void tcp_push_one(struct sock *sk, unsigned int mss_now)
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
1074 struct sk_buff *skb = sk->sk_send_head;
1075 unsigned int tso_segs, cwnd_quota;
1076
1077 BUG_ON(!skb || skb->len < mss_now);
1078
1079 tso_segs = tcp_init_tso_segs(sk, skb);
1080 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1081
1082 if (likely(cwnd_quota)) {
1083 BUG_ON(!tso_segs);
1084
1085 if (tso_segs > 1) {
1086 u32 limit = tcp_window_allows(tp, skb,
1087 mss_now, cwnd_quota);
1088
1089 if (skb->len < limit) {
1090 unsigned int trim = skb->len % mss_now;
1091
1092 if (trim)
1093 limit = skb->len - trim;
1094 }
1095 if (skb->len > limit) {
1096 if (unlikely(tso_fragment(sk, skb, limit)))
1097 return;
1098 }
1099 } else if (unlikely(skb->len > mss_now)) {
1100 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1101 return;
773 } 1102 }
774 1103
775 if (sent_pkts) { 1104 /* Send it out now. */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106
1107 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1108 update_send_head(sk, tp, skb);
776 tcp_cwnd_validate(sk, tp); 1109 tcp_cwnd_validate(sk, tp);
777 return 0; 1110 return;
778 } 1111 }
779
780 return !tp->packets_out && sk->sk_send_head;
781 } 1112 }
782 return 0;
783} 1113}
784 1114
785/* This function returns the amount that we can raise the 1115/* This function returns the amount that we can raise the
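
For multi-segment skbs, both the new tcp_write_xmit() and tcp_push_one() size each transmission from tcp_window_allows() and then round the split point down to a whole number of MSS before handing the skb to tso_fragment(). The byte arithmetic in isolation, as a plain-integer sketch:

#include <stdio.h>

/* Start from min(send window left, cwnd quota in bytes), then keep the
 * split point a whole multiple of the MSS.  The caller splits the skb
 * whenever skb_len ends up larger than the returned limit.
 */
static unsigned int xmit_limit(unsigned int skb_len, unsigned int mss,
			       unsigned int window_left,
			       unsigned int cwnd_quota)
{
	unsigned int cwnd_bytes = mss * cwnd_quota;
	unsigned int limit = window_left < cwnd_bytes ? window_left : cwnd_bytes;

	if (skb_len < limit) {
		unsigned int trim = skb_len % mss;

		if (trim)
			limit = skb_len - trim;
	}
	return limit;
}

int main(void)
{
	/* 20000-byte skb, 6 segments of cwnd quota, plenty of window:
	 * split after 6 * 1448 = 8688 bytes.
	 */
	printf("%u\n", xmit_limit(20000, 1448, 60000, 6));
	/* 5000-byte skb with ample quota: send 3 full segments (4344)
	 * now and keep the 656-byte tail queued.
	 */
	printf("%u\n", xmit_limit(5000, 1448, 60000, 20));
	return 0;
}
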
@@ -1039,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1039 if (sk->sk_route_caps & NETIF_F_TSO) { 1369 if (sk->sk_route_caps & NETIF_F_TSO) {
1040 sk->sk_route_caps &= ~NETIF_F_TSO; 1370 sk->sk_route_caps &= ~NETIF_F_TSO;
1041 sock_set_flag(sk, SOCK_NO_LARGESEND); 1371 sock_set_flag(sk, SOCK_NO_LARGESEND);
1042 tp->mss_cache = tp->mss_cache_std;
1043 } 1372 }
1044 1373
1045 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1374 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1101,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1101 * is still in somebody's hands, else make a clone. 1430 * is still in somebody's hands, else make a clone.
1102 */ 1431 */
1103 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1104 tcp_tso_set_push(skb);
1105 1433
1106 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1434 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1107 pskb_copy(skb, GFP_ATOMIC): 1435 pskb_copy(skb, GFP_ATOMIC):
@@ -1285,7 +1613,7 @@ void tcp_send_fin(struct sock *sk)
1285 * was unread data in the receive queue. This behavior is recommended 1613 * was unread data in the receive queue. This behavior is recommended
1286 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1614 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1287 */ 1615 */
1288void tcp_send_active_reset(struct sock *sk, int priority) 1616void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1289{ 1617{
1290 struct tcp_sock *tp = tcp_sk(sk); 1618 struct tcp_sock *tp = tcp_sk(sk);
1291 struct sk_buff *skb; 1619 struct sk_buff *skb;
@@ -1670,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
1670 if (sk->sk_route_caps & NETIF_F_TSO) { 1998 if (sk->sk_route_caps & NETIF_F_TSO) {
1671 sock_set_flag(sk, SOCK_NO_LARGESEND); 1999 sock_set_flag(sk, SOCK_NO_LARGESEND);
1672 sk->sk_route_caps &= ~NETIF_F_TSO; 2000 sk->sk_route_caps &= ~NETIF_F_TSO;
1673 tp->mss_cache = tp->mss_cache_std;
1674 } 2001 }
1675 } else if (!tcp_skb_pcount(skb)) 2002 } else if (!tcp_skb_pcount(skb))
1676 tcp_set_skb_tso_segs(sk, skb); 2003 tcp_set_skb_tso_segs(sk, skb);
1677 2004
1678 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2005 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1679 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2006 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1680 tcp_tso_set_push(skb);
1681 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2007 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1682 if (!err) { 2008 if (!err) {
1683 update_send_head(sk, tp, skb); 2009 update_send_head(sk, tp, skb);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b4498565..0084227438c2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
231 } 231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 232 tp->ack.pending &= ~TCP_ACK_TIMER;
233 233
234 if (skb_queue_len(&tp->ucopy.prequeue)) { 234 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 235 struct sk_buff *skb;
236 236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 237 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
238 skb_queue_len(&tp->ucopy.prequeue));
239 238
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb); 240 sk->sk_backlog_rcv(sk, skb);
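
The control-flow tests converted throughout this patch (prequeue, out_of_order_queue) only care about emptiness, and skb_queue_empty() answers that from the list head alone without touching qlen; the per-skb counts in the MIB bumps are dropped along the way, turning the ADD macros into INC. A self-contained sketch of the same idiom on a bare circular list:

#include <stdio.h>

/* Standalone illustration of the skb_queue_len() -> skb_queue_empty()
 * conversions: for a circular list, "empty" is just "head points at
 * itself"; no counter is needed.
 */
struct node {
	struct node *next, *prev;
};

static void list_init(struct node *head)
{
	head->next = head->prev = head;
}

static int list_empty(const struct node *head)
{
	return head->next == head;
}

static void list_add_tail(struct node *head, struct node *n)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

int main(void)
{
	struct node q, a;

	list_init(&q);
	printf("empty? %d\n", list_empty(&q));	/* 1 */
	list_add_tail(&q, &a);
	printf("empty? %d\n", list_empty(&q));	/* 0 */
	return 0;
}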