path: root/net
author	Dave Kleikamp <shaggy@austin.ibm.com>	2005-07-13 09:57:38 -0400
committer	Dave Kleikamp <shaggy@austin.ibm.com>	2005-07-13 09:57:38 -0400
commit	f7f24758ac98a506770bc5910d33567610fa3403 (patch)
tree	ff7fad3d01bf9dc2e2e54b908f9fca4891e1ee72 /net
parent	b38a3ab3d1bb0dc3288f73903d4dc4672b5cd2d0 (diff)
parent	c32511e2718618f0b53479eb36e07439aa363a74 (diff)
Merge with /home/shaggy/git/linus-clean/
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Diffstat (limited to 'net')
-rw-r--r--  net/802/fddi.c | 4
-rw-r--r--  net/8021q/Kconfig | 19
-rw-r--r--  net/8021q/vlan.c | 8
-rw-r--r--  net/Kconfig | 456
-rw-r--r--  net/atm/Kconfig | 74
-rw-r--r--  net/atm/br2684.c | 3
-rw-r--r--  net/bluetooth/cmtp/core.c | 6
-rw-r--r--  net/bluetooth/hidp/core.c | 5
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 7
-rw-r--r--  net/bluetooth/rfcomm/tty.c | 2
-rw-r--r--  net/bridge/Kconfig | 31
-rw-r--r--  net/bridge/br_netfilter.c | 2
-rw-r--r--  net/bridge/netfilter/ebt_log.c | 6
-rw-r--r--  net/core/dev.c | 132
-rw-r--r--  net/core/filter.c | 104
-rw-r--r--  net/core/neighbour.c | 6
-rw-r--r--  net/core/pktgen.c | 29
-rw-r--r--  net/core/rtnetlink.c | 2
-rw-r--r--  net/core/skbuff.c | 176
-rw-r--r--  net/core/sock.c | 11
-rw-r--r--  net/core/sysctl_net_core.c | 46
-rw-r--r--  net/core/wireless.c | 1
-rw-r--r--  net/decnet/Kconfig | 23
-rw-r--r--  net/decnet/af_decnet.c | 10
-rw-r--r--  net/decnet/dn_fib.c | 3
-rw-r--r--  net/decnet/dn_nsp_out.c | 3
-rw-r--r--  net/econet/Kconfig | 36
-rw-r--r--  net/ethernet/eth.c | 9
-rw-r--r--  net/ipv4/Kconfig | 180
-rw-r--r--  net/ipv4/Makefile | 10
-rw-r--r--  net/ipv4/af_inet.c | 11
-rw-r--r--  net/ipv4/fib_trie.c | 256
-rw-r--r--  net/ipv4/icmp.c | 3
-rw-r--r--  net/ipv4/igmp.c | 96
-rw-r--r--  net/ipv4/ip_input.c | 6
-rw-r--r--  net/ipv4/ip_output.c | 19
-rw-r--r--  net/ipv4/ip_sockglue.c | 6
-rw-r--r--  net/ipv4/ipconfig.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 10
-rw-r--r--  net/ipv4/ipvs/Kconfig | 4
-rw-r--r--  net/ipv4/ipvs/ip_vs_conn.c | 31
-rw-r--r--  net/ipv4/ipvs/ip_vs_ctl.c | 17
-rw-r--r--  net/ipv4/ipvs/ip_vs_sync.c | 4
-rw-r--r--  net/ipv4/netfilter/ip_conntrack_standalone.c | 7
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 7
-rw-r--r--  net/ipv4/route.c | 137
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 114
-rw-r--r--  net/ipv4/tcp.c | 85
-rw-r--r--  net/ipv4/tcp_bic.c | 331
-rw-r--r--  net/ipv4/tcp_cong.c | 237
-rw-r--r--  net/ipv4/tcp_diag.c | 34
-rw-r--r--  net/ipv4/tcp_highspeed.c | 181
-rw-r--r--  net/ipv4/tcp_htcp.c | 289
-rw-r--r--  net/ipv4/tcp_hybla.c | 187
-rw-r--r--  net/ipv4/tcp_input.c | 824
-rw-r--r--  net/ipv4/tcp_ipv4.c | 5
-rw-r--r--  net/ipv4/tcp_minisocks.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 569
-rw-r--r--  net/ipv4/tcp_scalable.c | 68
-rw-r--r--  net/ipv4/tcp_timer.c | 5
-rw-r--r--  net/ipv4/tcp_vegas.c | 411
-rw-r--r--  net/ipv4/tcp_westwood.c | 259
-rw-r--r--  net/ipv6/Kconfig | 22
-rw-r--r--  net/ipv6/addrconf.c | 19
-rw-r--r--  net/ipv6/af_inet6.c | 4
-rw-r--r--  net/ipv6/ip6_flowlabel.c | 1
-rw-r--r--  net/ipv6/ip6_output.c | 1
-rw-r--r--  net/ipv6/mcast.c | 29
-rw-r--r--  net/ipv6/tcp_ipv6.c | 4
-rw-r--r--  net/ipx/Kconfig | 33
-rw-r--r--  net/irda/irlap.c | 3
-rw-r--r--  net/irda/irlap_event.c | 14
-rw-r--r--  net/irda/irlap_frame.c | 8
-rw-r--r--  net/irda/irttp.c | 2
-rw-r--r--  net/lapb/Kconfig | 22
-rw-r--r--  net/llc/llc_c_ev.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 13
-rw-r--r--  net/packet/Kconfig | 26
-rw-r--r--  net/packet/af_packet.c | 6
-rw-r--r--  net/rxrpc/krxiod.c | 2
-rw-r--r--  net/rxrpc/krxsecd.c | 2
-rw-r--r--  net/rxrpc/krxtimod.c | 2
-rw-r--r--  net/sched/Kconfig | 50
-rw-r--r--  net/sched/Makefile | 3
-rw-r--r--  net/sched/act_api.c | 10
-rw-r--r--  net/sched/cls_api.c | 2
-rw-r--r--  net/sched/cls_rsvp.h | 1
-rw-r--r--  net/sched/em_meta.c | 6
-rw-r--r--  net/sched/em_text.c | 157
-rw-r--r--  net/sched/sch_api.c | 65
-rw-r--r--  net/sched/sch_blackhole.c | 54
-rw-r--r--  net/sched/sch_cbq.c | 3
-rw-r--r--  net/sched/sch_generic.c | 35
-rw-r--r--  net/sched/sch_red.c | 2
-rw-r--r--  net/sctp/associola.c | 15
-rw-r--r--  net/sctp/bind_addr.c | 16
-rw-r--r--  net/sctp/chunk.c | 2
-rw-r--r--  net/sctp/endpointola.c | 19
-rw-r--r--  net/sctp/input.c | 26
-rw-r--r--  net/sctp/inqueue.c | 18
-rw-r--r--  net/sctp/output.c | 22
-rw-r--r--  net/sctp/outqueue.c | 50
-rw-r--r--  net/sctp/protocol.c | 7
-rw-r--r--  net/sctp/sm_make_chunk.c | 27
-rw-r--r--  net/sctp/sm_sideeffect.c | 13
-rw-r--r--  net/sctp/sm_statefuns.c | 16
-rw-r--r--  net/sctp/socket.c | 2
-rw-r--r--  net/sctp/ssnmap.c | 3
-rw-r--r--  net/sctp/sysctl.c | 13
-rw-r--r--  net/sctp/transport.c | 6
-rw-r--r--  net/sctp/ulpevent.c | 19
-rw-r--r--  net/sctp/ulpqueue.c | 9
-rw-r--r--  net/sunrpc/sunrpc_syms.c | 1
-rw-r--r--  net/sunrpc/svcsock.c | 6
-rw-r--r--  net/sunrpc/xprt.c | 6
-rw-r--r--  net/unix/Kconfig | 21
-rw-r--r--  net/unix/af_unix.c | 4
-rw-r--r--  net/wanrouter/Kconfig | 29
-rw-r--r--  net/wanrouter/wanmain.c | 6
-rw-r--r--  net/x25/Kconfig | 36
-rw-r--r--  net/xfrm/Kconfig | 15
121 files changed, 4455 insertions(+), 2180 deletions(-)
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf4830d6f1..5ce24c4bb840 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff *skb)
  * the proper pointer to the start of packet data (skb->data).
  */
 
-unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct fddihdr *fddi = (struct fddihdr *)skb->data;
-	unsigned short type;
+	__be16 type;
 
 	/*
 	 * Set mac.raw field to point to FC byte, set data field to point
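The change above is a sparse endianness annotation: __be16 marks a 16-bit value that stays in network byte order, so mixing it with host-order integers can be flagged at build time. A minimal sketch of the idiom follows; the helper name is hypothetical and not part of this commit.

#include <linux/types.h>	/* __be16 */
#include <linux/if_ether.h>	/* ETH_P_IP */
#include <asm/byteorder.h>	/* htons() */

/* Hypothetical consumer: compare in network byte order, so the
 * packet field itself never needs converting at run time. */
static int frame_is_ipv4(__be16 proto)
{
	return proto == htons(ETH_P_IP);
}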
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
new file mode 100644
index 000000000000..c4a382e450e2
--- /dev/null
+++ b/net/8021q/Kconfig
@@ -0,0 +1,19 @@
+#
+# Configuration for 802.1Q VLAN support
+#
+
+config VLAN_8021Q
+	tristate "802.1Q VLAN Support"
+	---help---
+	  Select this and you will be able to create 802.1Q VLAN interfaces
+	  on your ethernet interfaces. 802.1Q VLAN supports almost
+	  everything a regular ethernet interface does, including
+	  firewalling, bridging, and of course IP traffic. You will need
+	  the 'vconfig' tool from the VLAN project in order to effectively
+	  use VLANs. See the VLAN web page for more information:
+	  <http://www.candelatech.com/~greear/vlan.html>
+
+	  To compile this code as a module, choose M here: the module
+	  will be called 8021q.
+
+	  If unsure, say N.
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 1f6d31670bc7..91e412b0ab00 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -578,6 +578,14 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		if (!vlandev)
 			continue;
 
+		if (netif_carrier_ok(dev)) {
+			if (!netif_carrier_ok(vlandev))
+				netif_carrier_on(vlandev);
+		} else {
+			if (netif_carrier_ok(vlandev))
+				netif_carrier_off(vlandev);
+		}
+
 		if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) {
 			vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK)
 						| flgs;
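The lines added here keep a VLAN device's carrier state in step with its underlying device. A condensed sketch of the same logic as a standalone helper (the function name is hypothetical, not a kernel API):

#include <linux/netdevice.h>

/* Mirror the lower device's carrier state onto the VLAN device
 * layered on top of it; sketch of the logic added above. */
static void vlan_propagate_carrier(struct net_device *dev,
				   struct net_device *vlandev)
{
	if (netif_carrier_ok(dev)) {
		if (!netif_carrier_ok(vlandev))
			netif_carrier_on(vlandev);
	} else {
		if (netif_carrier_ok(vlandev))
			netif_carrier_off(vlandev);
	}
}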
diff --git a/net/Kconfig b/net/Kconfig
index 9251b28e8d5d..2684e809a649 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -2,7 +2,7 @@
 # Network configuration
 #
 
-menu "Networking support"
+menu "Networking"
 
 config NET
 	bool "Networking support"
@@ -10,7 +10,9 @@ config NET
 	  Unless you really know what you are doing, you should say Y here.
 	  The reason is that some programs need kernel networking support even
 	  when running on a stand-alone machine that isn't connected to any
-	  other computer. If you are upgrading from an older kernel, you
+	  other computer.
+
+	  If you are upgrading from an older kernel, you
 	  should consider updating your networking tools too because changes
 	  in the kernel and the tools often go hand in hand. The tools are
 	  contained in the package net-tools, the location and version number
@@ -20,57 +22,14 @@ config NET
 	  recommended to read the NET-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
-menu "Networking options"
-	depends on NET
-
-config PACKET
-	tristate "Packet socket"
-	---help---
-	  The Packet protocol is used by applications which communicate
-	  directly with network devices without an intermediate network
-	  protocol implemented in the kernel, e.g. tcpdump. If you want them
-	  to work, choose Y.
+# Make sure that all config symbols are dependent on NET
+if NET
 
-	  To compile this driver as a module, choose M here: the module will
-	  be called af_packet.
-
-	  If unsure, say Y.
-
-config PACKET_MMAP
-	bool "Packet socket: mmapped IO"
-	depends on PACKET
-	help
-	  If you say Y here, the Packet protocol driver will use an IO
-	  mechanism that results in faster communication.
-
-	  If unsure, say N.
-
-config UNIX
-	tristate "Unix domain sockets"
-	---help---
-	  If you say Y here, you will include support for Unix domain sockets;
-	  sockets are the standard Unix mechanism for establishing and
-	  accessing network connections. Many commonly used programs such as
-	  the X Window system and syslog use these sockets even if your
-	  machine is not connected to any network. Unless you are working on
-	  an embedded system or something similar, you therefore definitely
-	  want to say Y here.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called unix. Note that several important services won't work
-	  correctly if you say M here and then neglect to load the module.
-
-	  Say Y unless you know what you are doing.
-
-config NET_KEY
-	tristate "PF_KEY sockets"
-	select XFRM
-	---help---
-	  PF_KEYv2 socket family, compatible to KAME ones.
-	  They are required if you are going to use IPsec tools ported
-	  from KAME.
+menu "Networking options"
 
-	  Say Y unless you know what you are doing.
+source "net/packet/Kconfig"
+source "net/unix/Kconfig"
+source "net/xfrm/Kconfig"
 
 config INET
 	bool "TCP/IP networking"
@@ -94,30 +53,12 @@ config INET
 
 	  Short answer: say Y.
 
+if INET
 source "net/ipv4/Kconfig"
-
-# IPv6 as module will cause a CRASH if you try to unload it
-config IPV6
-	tristate "The IPv6 protocol"
-	depends on INET
-	default m
-	select CRYPTO if IPV6_PRIVACY
-	select CRYPTO_MD5 if IPV6_PRIVACY
-	---help---
-	  This is complemental support for the IP version 6.
-	  You will still be able to do traditional IPv4 networking as well.
-
-	  For general information about IPv6, see
-	  <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
-	  For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
-	  For specific information about IPv6 under Linux, read the HOWTO at
-	  <http://www.bieringer.de/linux/IPv6/>.
-
-	  To compile this protocol support as a module, choose M here: the
-	  module will be called ipv6.
-
 source "net/ipv6/Kconfig"
 
+endif # if INET
+
 menuconfig NETFILTER
 	bool "Network packet filtering (replaces ipchains)"
 	---help---
@@ -206,269 +147,16 @@ source "net/bridge/netfilter/Kconfig"
 
 endif
 
-config XFRM
-	bool
-	depends on NET
-
-source "net/xfrm/Kconfig"
-
 source "net/sctp/Kconfig"
-
-config ATM
-	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  ATM is a high-speed networking technology for Local Area Networks
-	  and Wide Area Networks. It uses a fixed packet size and is
-	  connection oriented, allowing for the negotiation of minimum
-	  bandwidth requirements.
-
-	  In order to participate in an ATM network, your Linux box needs an
-	  ATM networking card. If you have that, say Y here and to the driver
-	  of your ATM card below.
-
-	  Note that you need a set of user-space programs to actually make use
-	  of ATM. See the file <file:Documentation/networking/atm.txt> for
-	  further details.
-
-config ATM_CLIP
-	tristate "Classical IP over ATM (EXPERIMENTAL)"
-	depends on ATM && INET
-	help
-	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
-	  ATMARP. If you want to communication with other IP hosts on your ATM
-	  network, you will typically either say Y here or to "LAN Emulation
-	  (LANE)" below.
-
-config ATM_CLIP_NO_ICMP
-	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
-	depends on ATM_CLIP
-	help
-	  Normally, an "ICMP host unreachable" message is sent if a neighbour
-	  cannot be reached because there is no VC to it in the kernel's
-	  ATMARP table. This may cause problems when ATMARP table entries are
-	  briefly removed during revalidation. If you say Y here, packets to
-	  such neighbours are silently discarded instead.
-
-config ATM_LANE
-	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
-	depends on ATM
-	help
-	  LAN Emulation emulates services of existing LANs across an ATM
-	  network. Besides operating as a normal ATM end station client, Linux
-	  LANE client can also act as an proxy client bridging packets between
-	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
-
-config ATM_MPOA
-	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
-	depends on ATM && INET && ATM_LANE!=n
-	help
-	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
-	  bridges and ATM attached hosts establish direct ATM VCs across
-	  subnetwork boundaries. These shortcut connections bypass routers
-	  enhancing overall network performance.
-
-config ATM_BR2684
-	tristate "RFC1483/2684 Bridged protocols"
-	depends on ATM && INET
-	help
-	  ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
-	  This device will act like an ethernet from the kernels point of view,
-	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
-	  This is sometimes used over DSL lines. If in doubt, say N.
-
-config ATM_BR2684_IPFILTER
-	bool "Per-VC IP filter kludge"
-	depends on ATM_BR2684
-	help
-	  This is an experimental mechanism for users who need to terminating a
-	  large number of IP-only vcc's. Do not enable this unless you are sure
-	  you know what you are doing.
-
-config BRIDGE
-	tristate "802.1d Ethernet Bridging"
-	---help---
-	  If you say Y here, then your Linux box will be able to act as an
-	  Ethernet bridge, which means that the different Ethernet segments it
-	  is connected to will appear as one Ethernet to the participants.
-	  Several such bridges can work together to create even larger
-	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
-	  As this is a standard, Linux bridges will cooperate properly with
-	  other third party bridge products.
-
-	  In order to use the Ethernet bridge, you'll need the bridge
-	  configuration tools; see <file:Documentation/networking/bridge.txt>
-	  for location. Please read the Bridge mini-HOWTO for more
-	  information.
-
-	  If you enable iptables support along with the bridge support then you
-	  turn your bridge into a bridging IP firewall.
-	  iptables will then see the IP packets being bridged, so you need to
-	  take this into account when setting up your firewall rules.
-	  Enabling arptables support when bridging will let arptables see
-	  bridged ARP traffic in the arptables FORWARD chain.
-
-	  To compile this code as a module, choose M here: the module
-	  will be called bridge.
-
-	  If unsure, say N.
-
-config VLAN_8021Q
-	tristate "802.1Q VLAN Support"
-	---help---
-	  Select this and you will be able to create 802.1Q VLAN interfaces
-	  on your ethernet interfaces. 802.1Q VLAN supports almost
-	  everything a regular ethernet interface does, including
-	  firewalling, bridging, and of course IP traffic. You will need
-	  the 'vconfig' tool from the VLAN project in order to effectively
-	  use VLANs. See the VLAN web page for more information:
-	  <http://www.candelatech.com/~greear/vlan.html>
-
-	  To compile this code as a module, choose M here: the module
-	  will be called 8021q.
-
-	  If unsure, say N.
-
-config DECNET
-	tristate "DECnet Support"
-	---help---
-	  The DECnet networking protocol was used in many products made by
-	  Digital (now Compaq). It provides reliable stream and sequenced
-	  packet communications over which run a variety of services similar
-	  to those which run over TCP/IP.
-
-	  To find some tools to use with the kernel layer support, please
-	  look at Patrick Caulfield's web site:
-	  <http://linux-decnet.sourceforge.net/>.
-
-	  More detailed documentation is available in
-	  <file:Documentation/networking/decnet.txt>.
-
-	  Be sure to say Y to "/proc file system support" and "Sysctl support"
-	  below when using DECnet, since you will need sysctl support to aid
-	  in configuration at run time.
-
-	  The DECnet code is also available as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want).
-	  The module is called decnet.
-
+source "net/atm/Kconfig"
+source "net/bridge/Kconfig"
+source "net/8021q/Kconfig"
 source "net/decnet/Kconfig"
-
 source "net/llc/Kconfig"
-
-config IPX
-	tristate "The IPX protocol"
-	select LLC
-	---help---
-	  This is support for the Novell networking protocol, IPX, commonly
-	  used for local networks of Windows machines. You need it if you
-	  want to access Novell NetWare file or print servers using the Linux
-	  Novell client ncpfs (available from
-	  <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
-	  within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>). In order
-	  to do the former, you'll also have to say Y to "NCP file system
-	  support", below.
-
-	  IPX is similar in scope to IP, while SPX, which runs on top of IPX,
-	  is similar to TCP. There is also experimental support for SPX in
-	  Linux (see "SPX networking", below).
-
-	  To turn your Linux box into a fully featured NetWare file server and
-	  IPX router, say Y here and fetch either lwared from
-	  <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
-	  mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
-	  information, read the IPX-HOWTO available from
-	  <http://www.tldp.org/docs.html#howto>.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  The IPX driver would enlarge your kernel by about 16 KB. To compile
-	  this driver as a module, choose M here: the module will be called ipx.
-	  Unless you want to integrate your Linux box with a local Novell
-	  network, say N.
-
 source "net/ipx/Kconfig"
-
-config ATALK
-	tristate "Appletalk protocol support"
-	select LLC
-	---help---
-	  AppleTalk is the protocol that Apple computers can use to communicate
-	  on a network. If your Linux box is connected to such a network and you
-	  wish to connect to it, say Y. You will need to use the netatalk package
-	  so that your Linux box can act as a print and file server for Macs as
-	  well as access AppleTalk printers. Check out
-	  <http://www.zettabyte.net/netatalk/> on the WWW for details.
-	  EtherTalk is the name used for AppleTalk over Ethernet and the
-	  cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
-	  network using serial links. EtherTalk and LocalTalk are fully
-	  supported by Linux.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
-	  NET-3-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, contains valuable
-	  information as well.
-
-	  To compile this driver as a module, choose M here: the module will be
-	  called appletalk. You almost certainly want to compile it as a
-	  module so you can restart your AppleTalk stack without rebooting
-	  your machine. I hear that the GNU boycott of Apple is over, so
-	  even politically correct people are allowed to say Y here.
-
 source "drivers/net/appletalk/Kconfig"
-
-config X25
-	tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  X.25 is a set of standardized network protocols, similar in scope to
-	  frame relay; the one physical line from your box to the X.25 network
-	  entry point can carry several logical point-to-point connections
-	  (called "virtual circuits") to other computers connected to the X.25
-	  network. Governments, banks, and other organizations tend to use it
-	  to connect to each other or to form Wide Area Networks (WANs). Many
-	  countries have public X.25 networks. X.25 consists of two
-	  protocols: the higher level Packet Layer Protocol (PLP) (say Y here
-	  if you want that) and the lower level data link layer protocol LAPB
-	  (say Y to "LAPB Data Link Driver" below if you want that).
-
-	  You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
-	  <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
-	  Information about X.25 for Linux is contained in the files
-	  <file:Documentation/networking/x25.txt> and
-	  <file:Documentation/networking/x25-iface.txt>.
-
-	  One connects to an X.25 network either with a dedicated network card
-	  using the X.21 protocol (not yet supported by Linux) or one can do
-	  X.25 over a standard telephone line using an ordinary modem (say Y
-	  to "X.25 async driver" below) or over Ethernet using an ordinary
-	  Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
-	  Driver" and "LAPB over Ethernet driver" below).
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called x25. If unsure, say N.
-
-config LAPB
-	tristate "LAPB Data Link Driver (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	---help---
-	  Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
-	  the lower) part of the X.25 protocol. It offers a reliable
-	  connection service to exchange data frames with one other host, and
-	  it is used to transport higher level protocols (mostly X.25 Packet
-	  Layer, the higher part of X.25, but others are possible as well).
-	  Usually, LAPB is used with specialized X.21 network cards, but Linux
-	  currently supports LAPB only over Ethernet connections. If you want
-	  to use LAPB connections over Ethernet, say Y here and to "LAPB over
-	  Ethernet driver" below. Read
-	  <file:Documentation/networking/lapb-module.txt> for technical
-	  details.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called lapb. If unsure, say N.
+source "net/x25/Kconfig"
+source "net/lapb/Kconfig"
 
 config NET_DIVERT
 	bool "Frame Diverter (EXPERIMENTAL)"
@@ -496,107 +184,10 @@ config NET_DIVERT
 
 	  If unsure, say N.
 
-config ECONET
-	tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && INET
-	---help---
-	  Econet is a fairly old and slow networking protocol mainly used by
-	  Acorn computers to access file and print servers. It uses native
-	  Econet network cards. AUN is an implementation of the higher level
-	  parts of Econet that runs over ordinary Ethernet connections, on
-	  top of the UDP packet protocol, which in turn runs on top of the
-	  Internet protocol IP.
-
-	  If you say Y here, you can choose with the next two options whether
-	  to send Econet/AUN traffic over a UDP Ethernet connection or over
-	  a native Econet network card.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called econet.
-
-config ECONET_AUNUDP
-	bool "AUN over UDP"
-	depends on ECONET
-	help
-	  Say Y here if you want to send Econet/AUN traffic over a UDP
-	  connection (UDP is a packet based protocol that runs on top of the
-	  Internet protocol IP) using an ordinary Ethernet network card.
-
-config ECONET_NATIVE
-	bool "Native Econet"
-	depends on ECONET
-	help
-	  Say Y here if you have a native Econet network card installed in
-	  your computer.
-
-config WAN_ROUTER
-	tristate "WAN router"
-	depends on EXPERIMENTAL
-	---help---
-	  Wide Area Networks (WANs), such as X.25, frame relay and leased
-	  lines, are used to interconnect Local Area Networks (LANs) over vast
-	  distances with data transfer rates significantly higher than those
-	  achievable with commonly used asynchronous modem connections.
-	  Usually, a quite expensive external device called a `WAN router' is
-	  needed to connect to a WAN.
-
-	  As an alternative, WAN routing can be built into the Linux kernel.
-	  With relatively inexpensive WAN interface cards available on the
-	  market, a perfectly usable router can be built for less than half
-	  the price of an external router. If you have one of those cards and
-	  wish to use your Linux box as a WAN router, say Y here and also to
-	  the WAN driver for your card, below. You will then need the
-	  wan-tools package which is available from <ftp://ftp.sangoma.com/>.
-	  Read <file:Documentation/networking/wan-router.txt> for more
-	  information.
-
-	  To compile WAN routing support as a module, choose M here: the
-	  module will be called wanrouter.
-
-	  If unsure, say N.
-
-menu "QoS and/or fair queueing"
-
-config NET_SCHED
-	bool "QoS and/or fair queueing"
-	---help---
-	  When the kernel has several packets to send out over a network
-	  device, it has to decide which ones to send first, which ones to
-	  delay, and which ones to drop. This is the job of the packet
-	  scheduler, and several different algorithms for how to do this
-	  "fairly" have been proposed.
-
-	  If you say N here, you will get the standard packet scheduler, which
-	  is a FIFO (first come, first served). If you say Y here, you will be
-	  able to choose from among several alternative algorithms which can
-	  then be attached to different network devices. This is useful for
-	  example if some of your network devices are real time devices that
-	  need a certain minimum data flow rate, or if you need to limit the
-	  maximum data flow rate for traffic which matches specified criteria.
-	  This code is considered to be experimental.
-
-	  To administer these schedulers, you'll need the user-level utilities
-	  from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
-	  That package also contains some documentation; for more, check out
-	  <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
-
-	  This Quality of Service (QoS) support will enable you to use
-	  Differentiated Services (diffserv) and Resource Reservation Protocol
-	  (RSVP) on your Linux router if you also say Y to "QoS support",
-	  "Packet classifier API" and to some classifiers below. Documentation
-	  and software is at <http://diffserv.sourceforge.net/>.
-
-	  If you say Y here and to "/proc file system" below, you will be able
-	  to read status information about packet schedulers from the file
-	  /proc/net/psched.
-
-	  The available schedulers are listed in the following questions; you
-	  can say Y to as many as you like. If unsure, say N now.
-
+source "net/econet/Kconfig"
+source "net/wanrouter/Kconfig"
 source "net/sched/Kconfig"
 
-endmenu
-
 menu "Network testing"
 
 config NET_PKTGEN
@@ -635,12 +226,9 @@ config NET_POLL_CONTROLLER
 	def_bool NETPOLL
 
 source "net/ax25/Kconfig"
-
 source "net/irda/Kconfig"
-
 source "net/bluetooth/Kconfig"
 
-source "drivers/net/Kconfig"
-
-endmenu
+endif # if NET
+endmenu # Networking
 
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
new file mode 100644
index 000000000000..bea2426229b1
--- /dev/null
+++ b/net/atm/Kconfig
@@ -0,0 +1,74 @@
+#
+# Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)
+#
+
+config ATM
+	tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	---help---
+	  ATM is a high-speed networking technology for Local Area Networks
+	  and Wide Area Networks. It uses a fixed packet size and is
+	  connection oriented, allowing for the negotiation of minimum
+	  bandwidth requirements.
+
+	  In order to participate in an ATM network, your Linux box needs an
+	  ATM networking card. If you have that, say Y here and to the driver
+	  of your ATM card below.
+
+	  Note that you need a set of user-space programs to actually make use
+	  of ATM. See the file <file:Documentation/networking/atm.txt> for
+	  further details.
+
+config ATM_CLIP
+	tristate "Classical IP over ATM (EXPERIMENTAL)"
+	depends on ATM && INET
+	help
+	  Classical IP over ATM for PVCs and SVCs, supporting InARP and
+	  ATMARP. If you want to communication with other IP hosts on your ATM
+	  network, you will typically either say Y here or to "LAN Emulation
+	  (LANE)" below.
+
+config ATM_CLIP_NO_ICMP
+	bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)"
+	depends on ATM_CLIP
+	help
+	  Normally, an "ICMP host unreachable" message is sent if a neighbour
+	  cannot be reached because there is no VC to it in the kernel's
+	  ATMARP table. This may cause problems when ATMARP table entries are
+	  briefly removed during revalidation. If you say Y here, packets to
+	  such neighbours are silently discarded instead.
+
+config ATM_LANE
+	tristate "LAN Emulation (LANE) support (EXPERIMENTAL)"
+	depends on ATM
+	help
+	  LAN Emulation emulates services of existing LANs across an ATM
+	  network. Besides operating as a normal ATM end station client, Linux
+	  LANE client can also act as an proxy client bridging packets between
+	  ELAN and Ethernet segments. You need LANE if you want to try MPOA.
+
+config ATM_MPOA
+	tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)"
+	depends on ATM && INET && ATM_LANE!=n
+	help
+	  Multi-Protocol Over ATM allows ATM edge devices such as routers,
+	  bridges and ATM attached hosts establish direct ATM VCs across
+	  subnetwork boundaries. These shortcut connections bypass routers
+	  enhancing overall network performance.
+
+config ATM_BR2684
+	tristate "RFC1483/2684 Bridged protocols"
+	depends on ATM && INET
+	help
+	  ATM PVCs can carry ethernet PDUs according to rfc2684 (formerly 1483)
+	  This device will act like an ethernet from the kernels point of view,
+	  with the traffic being carried by ATM PVCs (currently 1 PVC/device).
+	  This is sometimes used over DSL lines. If in doubt, say N.
+
+config ATM_BR2684_IPFILTER
+	bool "Per-VC IP filter kludge"
+	depends on ATM_BR2684
+	help
+	  This is an experimental mechanism for users who need to terminating a
+	  large number of IP-only vcc's. Do not enable this unless you are sure
+	  you know what you are doing.
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf1459d..289956c4dd3e 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
  * This is similar to eth_type_trans, which cannot be used because of
  * our dev->hard_header_len
  */
-static inline unsigned short br_type_trans(struct sk_buff *skb,
-					   struct net_device *dev)
+static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 2e341de3e763..901eff7ebe74 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -213,7 +213,7 @@ static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, in
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int cmtp_process_transmit(struct cmtp_session *session)
+static void cmtp_process_transmit(struct cmtp_session *session)
 {
 	struct sk_buff *skb, *nskb;
 	unsigned char *hdr;
@@ -223,7 +223,7 @@ static int cmtp_process_transmit(struct cmtp_session *session)
 
 	if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) {
 		BT_ERR("Can't allocate memory for new frame");
-		return -ENOMEM;
+		return;
 	}
 
 	while ((skb = skb_dequeue(&session->transmit))) {
@@ -275,8 +275,6 @@ static int cmtp_process_transmit(struct cmtp_session *session)
 	cmtp_send_frame(session, nskb->data, nskb->len);
 
 	kfree_skb(nskb);
-
-	return skb_queue_len(&session->transmit);
 }
 
 static int cmtp_session(void *arg)
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index affbc55462e8..de8af5f42394 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -428,7 +428,7 @@ static int hidp_send_frame(struct socket *sock, unsigned char *data, int len)
 	return kernel_sendmsg(sock, &msg, &iv, 1, len);
 }
 
-static int hidp_process_transmit(struct hidp_session *session)
+static void hidp_process_transmit(struct hidp_session *session)
 {
 	struct sk_buff *skb;
 
@@ -453,9 +453,6 @@ static int hidp_process_transmit(struct hidp_session *session)
 		hidp_set_timer(session);
 		kfree_skb(skb);
 	}
-
-	return skb_queue_len(&session->ctrl_transmit) +
-				skb_queue_len(&session->intr_transmit);
 }
 
 static int hidp_session(void *arg)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f3f6355a2786..63a123c5c41b 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -590,8 +590,11 @@ static long rfcomm_sock_data_wait(struct sock *sk, long timeo)
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) ||
-				signal_pending(current) || !timeo)
+		if (!skb_queue_empty(&sk->sk_receive_queue) ||
+		    sk->sk_err ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+		    signal_pending(current) ||
+		    !timeo)
 			break;
 
 		set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 6d689200bcf3..6304590fd36a 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -781,7 +781,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
 
 	BT_DBG("tty %p dev %p", tty, dev);
 
-	if (skb_queue_len(&dlc->tx_queue))
+	if (!skb_queue_empty(&dlc->tx_queue))
 		return dlc->mtu;
 
 	return 0;
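Both rfcomm hunks replace truth tests on skb_queue_len() with skb_queue_empty(). The conditions are equivalent, but skb_queue_empty() states the intent and only inspects the list head. A sketch of the preferred idiom (the helper is hypothetical, assuming the usual <net/sock.h> definitions):

#include <net/sock.h>	/* struct sock, sk_receive_queue */

/* Hypothetical helper: "is there anything to read?" expressed with
 * skb_queue_empty() rather than by reading the queue length. */
static int rx_data_pending(struct sock *sk)
{
	return !skb_queue_empty(&sk->sk_receive_queue);
}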
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
new file mode 100644
index 000000000000..db23d59746cf
--- /dev/null
+++ b/net/bridge/Kconfig
@@ -0,0 +1,31 @@
+#
+# 802.1d Ethernet Bridging
+#
+
+config BRIDGE
+	tristate "802.1d Ethernet Bridging"
+	---help---
+	  If you say Y here, then your Linux box will be able to act as an
+	  Ethernet bridge, which means that the different Ethernet segments it
+	  is connected to will appear as one Ethernet to the participants.
+	  Several such bridges can work together to create even larger
+	  networks of Ethernets using the IEEE 802.1 spanning tree algorithm.
+	  As this is a standard, Linux bridges will cooperate properly with
+	  other third party bridge products.
+
+	  In order to use the Ethernet bridge, you'll need the bridge
+	  configuration tools; see <file:Documentation/networking/bridge.txt>
+	  for location. Please read the Bridge mini-HOWTO for more
+	  information.
+
+	  If you enable iptables support along with the bridge support then you
+	  turn your bridge into a bridging IP firewall.
+	  iptables will then see the IP packets being bridged, so you need to
+	  take this into account when setting up your firewall rules.
+	  Enabling arptables support when bridging will let arptables see
+	  bridged ARP traffic in the arptables FORWARD chain.
+
+	  To compile this code as a module, choose M here: the module
+	  will be called bridge.
+
+	  If unsure, say N.
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 03ae4edddac3..2d52fee63a8c 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -844,7 +844,7 @@ static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb,
 	 * doesn't use the bridge parent of the indev by using
 	 * the BRNF_DONT_TAKE_PARENT mask. */
 	if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) {
-		nf_bridge->mask &= BRNF_DONT_TAKE_PARENT;
+		nf_bridge->mask |= BRNF_DONT_TAKE_PARENT;
 		nf_bridge->physindev = (struct net_device *)in;
 	}
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
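The one-character fix above matters: `&=` keeps only the bits already present in BRNF_DONT_TAKE_PARENT, wiping the rest of the mask and never actually setting the flag, while `|=` sets it and preserves the others. A userspace demonstration with made-up values:

#include <stdio.h>

int main(void)
{
	unsigned int mask = 0x5;	/* some flags already set */
	unsigned int flag = 0x2;	/* flag we intend to add */

	/* Buggy form: &= intersects, losing everything. */
	printf("mask &= flag -> 0x%x\n", mask & flag);	/* 0x0 */
	/* Fixed form: |= adds the flag, keeps the rest. */
	printf("mask |= flag -> 0x%x\n", mask | flag);	/* 0x7 */
	return 0;
}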
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index e4ae34b88925..662975be3d1d 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -61,8 +61,6 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 {
 	struct ebt_log_info *info = (struct ebt_log_info *)data;
 	char level_string[4] = "< >";
-	union {struct iphdr iph; struct tcpudphdr ports;
-	       struct arphdr arph; struct arppayload arpp;} u;
 
 	level_string[1] = '0' + info->loglevel;
 	spin_lock_bh(&ebt_log_lock);
@@ -88,7 +86,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	}
 	printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,",
 	       NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
-	printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos,
+	printk(" IP tos=0x%02X, IP proto=%d", ih->tos,
 	       ih->protocol);
 	if (ih->protocol == IPPROTO_TCP ||
 	    ih->protocol == IPPROTO_UDP) {
@@ -127,7 +125,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr,
 	    ah->ar_pln == sizeof(uint32_t)) {
 		struct arppayload _arpp, *ap;
 
-		ap = skb_header_pointer(skb, sizeof(u.arph),
+		ap = skb_header_pointer(skb, sizeof(_arph),
 					sizeof(_arpp), &_arpp);
 		if (ap == NULL) {
 			printk(" INCOMPLETE ARP payload");
diff --git a/net/core/dev.c b/net/core/dev.c
index ab935778ce81..ff9dc029233a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate. It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  * The list of packet types we will receive (as opposed to discard)
  * and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;	/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain;
  * Device drivers call our routines to queue packets here. We empty the
  * queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -1144,7 +1127,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 extern void skb_release_data(struct sk_buff *);
 
 /* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, int gfp_mask)
+int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	unsigned int size;
 	u8 *data;
@@ -1363,71 +1346,13 @@ out:
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
 int weight_p = 64;	/* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1448,7 +1373,6 @@ static void sample_queue(unsigned long dummy)
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;
 
@@ -1464,38 +1388,22 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
 			local_irq_restore(flags);
-			return queue->cng_level;
+			return NET_RX_SUCCESS;
 		}
 
-		if (queue->throttle)
-			queue->throttle = 0;
-
 		netif_rx_schedule(&queue->backlog_dev);
 		goto enqueue;
 	}
 
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
-	}
-
-drop:
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);
 
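With the throttle and congestion-level machinery gone, netif_rx() reduces to a single test against netdev_max_backlog. A simplified sketch of the resulting flow; IRQ masking and statistics are omitted and the function name is invented, so this is not the verbatim kernel code:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Simplified shape of netif_rx() after this hunk. */
static int netif_rx_sketch(struct softnet_data *queue, struct sk_buff *skb)
{
	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
		if (skb_queue_empty(&queue->input_pkt_queue))
			netif_rx_schedule(&queue->backlog_dev); /* arm softirq */
		dev_hold(skb->dev);
		__skb_queue_tail(&queue->input_pkt_queue, skb);
		return NET_RX_SUCCESS;	/* the only success value now */
	}
	kfree_skb(skb);
	return NET_RX_DROP;
}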
@@ -1780,8 +1688,6 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -1790,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
-	int budget = netdev_max_backlog;
-
+	int budget = netdev_budget;
 
 	local_irq_disable();
 
@@ -2055,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		   );
+		   s->total, s->dropped, s->time_squeeze, 0,
+		   0, 0, 0, 0, /* was fastroute */
+		   s->cpu_collision );
 	return 0;
 }
 
@@ -2190,10 +2089,11 @@ void dev_set_promiscuity(struct net_device *dev, int inc)
 {
 	unsigned short old_flags = dev->flags;
 
-	dev->flags |= IFF_PROMISC;
 	if ((dev->promiscuity += inc) == 0)
 		dev->flags &= ~IFF_PROMISC;
-	if (dev->flags ^ old_flags) {
+	else
+		dev->flags |= IFF_PROMISC;
+	if (dev->flags != old_flags) {
 		dev_mc_upload(dev);
 		printk(KERN_INFO "device %s %s promiscuous mode\n",
 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
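The reordering above makes dev_set_promiscuity() derive IFF_PROMISC from the reference count after the count is adjusted, so the log message fires exactly on real flag changes. The same control flow lifted into a self-contained userspace sketch (the flag value is borrowed for illustration):

#include <stdio.h>

#define IFF_PROMISC 0x100	/* value borrowed for illustration */

/* Sketch of the corrected ordering: adjust the count first, derive
 * the flag from it second, then compare against the saved flags. */
static void set_promiscuity(unsigned short *flags, int *count, int inc)
{
	unsigned short old_flags = *flags;

	if ((*count += inc) == 0)
		*flags &= ~IFF_PROMISC;
	else
		*flags |= IFF_PROMISC;
	if (*flags != old_flags)
		printf("device %s promiscuous mode\n",
		       (*flags & IFF_PROMISC) ? "entered" : "left");
}

int main(void)
{
	unsigned short flags = 0;
	int count = 0;

	set_promiscuity(&flags, &count, 1);	/* prints "entered" */
	set_promiscuity(&flags, &count, -1);	/* prints "left" */
	return 0;
}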
@@ -3305,9 +3205,6 @@ static int __init net_dev_init(void)
 
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3316,11 +3213,6 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
-
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/filter.c b/net/core/filter.c
index f3b88205ace2..cd91a24f9720 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -36,7 +36,7 @@
 #include <linux/filter.h>
 
 /* No hurry in this branch */
-static u8 *load_pointer(struct sk_buff *skb, int k)
+static void *__load_pointer(struct sk_buff *skb, int k)
 {
 	u8 *ptr = NULL;
 
@@ -50,6 +50,18 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
 	return NULL;
 }
 
+static inline void *load_pointer(struct sk_buff *skb, int k,
+				 unsigned int size, void *buffer)
+{
+	if (k >= 0)
+		return skb_header_pointer(skb, k, size, buffer);
+	else {
+		if (k >= SKF_AD_OFF)
+			return NULL;
+		return __load_pointer(skb, k);
+	}
+}
+
 /**
  *	sk_run_filter - run a filter on a socket
  *	@skb: buffer to run the filter on
@@ -64,15 +76,12 @@ static u8 *load_pointer(struct sk_buff *skb, int k)
 
 int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 {
-	unsigned char *data = skb->data;
-	/* len is UNSIGNED. Byte wide insns relies only on implicit
-	   type casts to prevent reading arbitrary memory locations.
-	 */
-	unsigned int len = skb->len-skb->data_len;
 	struct sock_filter *fentry;	/* We walk down these */
+	void *ptr;
 	u32 A = 0;			/* Accumulator */
 	u32 X = 0;			/* Index Register */
 	u32 mem[BPF_MEMWORDS];		/* Scratch Memory Store */
+	u32 tmp;
 	int k;
 	int pc;
 
@@ -168,86 +177,35 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 		case BPF_LD|BPF_W|BPF_ABS:
 			k = fentry->k;
 load_w:
-			if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) {
-				A = ntohl(*(u32*)&data[k]);
+			ptr = load_pointer(skb, k, 4, &tmp);
+			if (ptr != NULL) {
+				A = ntohl(*(u32 *)ptr);
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = ntohl(*(u32*)ptr);
-					continue;
-				}
-			} else {
-				u32 _tmp, *p;
-				p = skb_header_pointer(skb, k, 4, &_tmp);
-				if (p != NULL) {
-					A = ntohl(*p);
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_H|BPF_ABS:
 			k = fentry->k;
 load_h:
-			if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) {
-				A = ntohs(*(u16*)&data[k]);
+			ptr = load_pointer(skb, k, 2, &tmp);
+			if (ptr != NULL) {
+				A = ntohs(*(u16 *)ptr);
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = ntohs(*(u16*)ptr);
-					continue;
-				}
-			} else {
-				u16 _tmp, *p;
-				p = skb_header_pointer(skb, k, 2, &_tmp);
-				if (p != NULL) {
-					A = ntohs(*p);
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_B|BPF_ABS:
 			k = fentry->k;
load_b:
-			if (k >= 0 && (unsigned int)k < len) {
-				A = data[k];
+			ptr = load_pointer(skb, k, 1, &tmp);
+			if (ptr != NULL) {
+				A = *(u8 *)ptr;
 				continue;
 			}
-			if (k < 0) {
-				u8 *ptr;
-
-				if (k >= SKF_AD_OFF)
-					break;
-				ptr = load_pointer(skb, k);
-				if (ptr) {
-					A = *ptr;
-					continue;
-				}
-			} else {
-				u8 _tmp, *p;
-				p = skb_header_pointer(skb, k, 1, &_tmp);
-				if (p != NULL) {
-					A = *p;
-					continue;
-				}
-			}
 			return 0;
 		case BPF_LD|BPF_W|BPF_LEN:
-			A = len;
+			A = skb->len;
 			continue;
 		case BPF_LDX|BPF_W|BPF_LEN:
-			X = len;
+			X = skb->len;
 			continue;
 		case BPF_LD|BPF_W|BPF_IND:
 			k = X + fentry->k;
@@ -259,10 +217,12 @@ load_b:
 			k = X + fentry->k;
 			goto load_b;
 		case BPF_LDX|BPF_B|BPF_MSH:
-			if (fentry->k >= len)
-				return 0;
-			X = (data[fentry->k] & 0xf) << 2;
+			ptr = load_pointer(skb, fentry->k, 1, &tmp);
+			if (ptr != NULL) {
+				X = (*(u8 *)ptr & 0xf) << 2;
 				continue;
+			}
+			return 0;
 		case BPF_LD|BPF_IMM:
 			A = fentry->k;
 			continue;
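The rewrite funnels every absolute load through the new load_pointer(): non-negative offsets go to skb_header_pointer(), which returns a direct pointer when the bytes sit in the linear skb area and otherwise copies them into the caller's buffer, while negative offsets keep their ancillary-data meaning via __load_pointer(). A usage sketch of the underlying primitive; the helper name is hypothetical and not from this commit:

#include <linux/skbuff.h>	/* skb_header_pointer(), ntohs() via byteorder */

/* Read a 16-bit big-endian field at offset k, coping with data that
 * may live in fragments rather than the linear skb area. */
static int read_be16(struct sk_buff *skb, int k, u16 *val)
{
	u16 tmp, *p;

	p = skb_header_pointer(skb, k, sizeof(tmp), &tmp);
	if (p == NULL)
		return -1;	/* offset runs past the packet */
	*val = ntohs(*p);
	return 0;
}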
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 851eb927ed97..1beb782ac41b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1598,6 +1598,8 @@ static int neightbl_fill_info(struct neigh_table *tbl, struct sk_buff *skb,
 
 	read_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1   = 0;
+	ndtmsg->ndtm_pad2   = 0;
 
 	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
 	RTA_PUT_MSECS(skb, NDTA_GC_INTERVAL, tbl->gc_interval);
@@ -1683,6 +1685,8 @@ static int neightbl_fill_param_info(struct neigh_table *tbl,
 
 	read_lock_bh(&tbl->lock);
 	ndtmsg->ndtm_family = tbl->family;
+	ndtmsg->ndtm_pad1   = 0;
+	ndtmsg->ndtm_pad2   = 0;
 	RTA_PUT_STRING(skb, NDTA_NAME, tbl->id);
 
 	if (neightbl_fill_parms(skb, parms) < 0)
@@ -1872,6 +1876,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n,
1872 struct ndmsg *ndm = NLMSG_DATA(nlh); 1876 struct ndmsg *ndm = NLMSG_DATA(nlh);
1873 1877
1874 ndm->ndm_family = n->ops->family; 1878 ndm->ndm_family = n->ops->family;
1879 ndm->ndm_pad1 = 0;
1880 ndm->ndm_pad2 = 0;
1875 ndm->ndm_flags = n->flags; 1881 ndm->ndm_flags = n->flags;
1876 ndm->ndm_type = n->type; 1882 ndm->ndm_type = n->type;
1877 ndm->ndm_ifindex = n->dev->ifindex; 1883 ndm->ndm_ifindex = n->dev->ifindex;
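These pad fields exist purely for alignment, but the message header is copied to userspace verbatim, so leaving them uninitialized leaks stale kernel memory. For reference, the layout being filled (as in this era's linux/neighbour.h; shown for illustration):

	struct ndmsg {
		__u8	ndm_family;
		__u8	ndm_pad1;	/* must be zeroed before the skb is sent */
		__u16	ndm_pad2;	/* likewise */
		__s32	ndm_ifindex;
		__u16	ndm_state;
		__u8	ndm_flags;
		__u8	ndm_type;
	};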
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index c57b06bc79f3..975d651312dc 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -151,7 +151,7 @@
 #include <asm/timex.h>
 
 
-#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n"
+#define VERSION "pktgen v2.62: Packet Generator for packet performance testing.\n"
 
 /* #define PG_DEBUG(a) a */
 #define PG_DEBUG(a)
@@ -1921,6 +1921,11 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	struct iphdr *iph;
 	struct pktgen_hdr *pgh = NULL;
 
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	mod_cur_headers(pkt_dev);
+
 	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
 	if (!skb) {
 		sprintf(pkt_dev->result, "No memory");
@@ -1934,11 +1939,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr));
 	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
 
-	/* Update any of the values, used when we're incrementing various
-	 * fields.
-	 */
-	mod_cur_headers(pkt_dev);
-
 	memcpy(eth, pkt_dev->hh, 12);
 	*(u16*)&eth[12] = __constant_htons(ETH_P_IP);
 
@@ -2192,7 +2192,12 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	int datalen;
 	struct ipv6hdr *iph;
 	struct pktgen_hdr *pgh = NULL;
 
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+	 */
+	mod_cur_headers(pkt_dev);
+
 	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
 	if (!skb) {
 		sprintf(pkt_dev->result, "No memory");
@@ -2206,17 +2211,9 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
 	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
 
-
-	/* Update any of the values, used when we're incrementing various
-	 * fields.
-	 */
-	mod_cur_headers(pkt_dev);
-
-
 	memcpy(eth, pkt_dev->hh, 12);
 	*(u16*)&eth[12] = __constant_htons(ETH_P_IPV6);
 
-
 	datalen = pkt_dev->cur_pkt_size-14-
 		  sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */
 
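Both fill_packet variants hoist mod_cur_headers() above alloc_skb(). The ordering matters: mod_cur_headers() is what advances cur_pkt_size (along with the incrementing ports and addresses), and the allocation consumes that value, so with the old placement every skb was sized from the previous iteration's state. In miniature:

	/* Intended order of operations; pkt_dev and mod_cur_headers are
	 * the pktgen names used in the hunks above. */
	mod_cur_headers(pkt_dev);	/* 1. advance cur_pkt_size, ports, ... */
	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
					/* 2. size the buffer from fresh values */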
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e013d836a7ab..4b1bb30e6381 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
 	rta->rta_type = attrtype;
 	rta->rta_len = size;
 	memcpy(RTA_DATA(rta), data, attrlen);
+	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
 }
 
 size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
@@ -188,6 +189,7 @@ static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*r), flags);
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_UNSPEC;
+	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
 	r->ifi_flags = dev_get_flags(dev);
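The new memset in __rta_fill() clears the RTA_ALIGN() gap between the attribute payload and the next attribute, which previously held whatever the skb tail happened to contain. A worked example of the arithmetic (values are illustrative):

	/* For a 6-byte payload:
	 *   size = RTA_LENGTH(6) = attribute header (4) + 6 = 10
	 *   RTA_ALIGN(10)        = 12
	 * so two trailing pad bytes must be cleared explicitly: */
	int attrlen = 6;
	int size = RTA_LENGTH(attrlen);
	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);	/* 2 bytes */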
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d68c03bc051..d9f7b06fe886 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
+struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -182,7 +182,8 @@ nodata:
  *	%GFP_ATOMIC.
  */
 struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
-				     unsigned int size, int gfp_mask)
+				     unsigned int size,
+				     unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb)
  *	%GFP_ATOMIC.
  */
 
-struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 
@@ -357,7 +358,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
 	C(ip_summed);
 	C(priority);
 	C(protocol);
-	C(security);
 	n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	C(nfmark);
@@ -422,7 +422,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->pkt_type	= old->pkt_type;
 	new->stamp	= old->stamp;
 	new->destructor = NULL;
-	new->security	= old->security;
 #ifdef CONFIG_NETFILTER
 	new->nfmark	= old->nfmark;
 	new->nfcache	= old->nfcache;
@@ -462,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  *	header is going to be modified. Use pskb_copy() instead.
  */
 
-struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
 	/*
@@ -501,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
  *	The returned buffer has a reference count of 1.
  */
 
-struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
@@ -559,7 +558,8 @@ out:
  *	reloaded after call to this function.
  */
 
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+		     unsigned int __nocast gfp_mask)
 {
 	int i;
 	u8 *data;
@@ -649,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
  *	only by netfilter in the cases when checksum is recalculated? --ANK
  */
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
-				int newheadroom, int newtailroom, int gfp_mask)
+				int newheadroom, int newtailroom,
+				unsigned int __nocast gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
@@ -1500,6 +1501,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 	skb_split_no_header(skb, skb1, len, pos);
 }
 
1504/**
1505 * skb_prepare_seq_read - Prepare a sequential read of skb data
1506 * @skb: the buffer to read
1507 * @from: lower offset of data to be read
1508 * @to: upper offset of data to be read
1509 * @st: state variable
1510 *
1511 * Initializes the specified state variable. Must be called before
1512 * invoking skb_seq_read() for the first time.
1513 */
1514void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
1515 unsigned int to, struct skb_seq_state *st)
1516{
1517 st->lower_offset = from;
1518 st->upper_offset = to;
1519 st->root_skb = st->cur_skb = skb;
1520 st->frag_idx = st->stepped_offset = 0;
1521 st->frag_data = NULL;
1522}
1523
1524/**
1525 * skb_seq_read - Sequentially read skb data
1526 * @consumed: number of bytes consumed by the caller so far
1527 * @data: destination pointer for data to be returned
1528 * @st: state variable
1529 *
1530 * Reads a block of skb data at &consumed relative to the
1531 * lower offset specified to skb_prepare_seq_read(). Assigns
1532 * the head of the data block to &data and returns the length
1533 * of the block or 0 if the end of the skb data or the upper
1534 * offset has been reached.
1535 *
1536 * The caller is not required to consume all of the data
1537 * returned, i.e. &consumed is typically set to the number
1538 * of bytes already consumed and the next call to
1539 * skb_seq_read() will return the remaining part of the block.
1540 *
1541 *	Note: The size of each block of data returned can be arbitrary,
1542 *	      this limitation is the cost for zerocopy sequential
1543 *	      reads of potentially non-linear data.
1544 *
1545 * Note: Fragment lists within fragments are not implemented
1546 * at the moment, state->root_skb could be replaced with
1547 * a stack for this purpose.
1548 */
1549unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
1550 struct skb_seq_state *st)
1551{
1552 unsigned int block_limit, abs_offset = consumed + st->lower_offset;
1553 skb_frag_t *frag;
1554
1555 if (unlikely(abs_offset >= st->upper_offset))
1556 return 0;
1557
1558next_skb:
1559 block_limit = skb_headlen(st->cur_skb);
1560
1561 if (abs_offset < block_limit) {
1562 *data = st->cur_skb->data + abs_offset;
1563 return block_limit - abs_offset;
1564 }
1565
1566 if (st->frag_idx == 0 && !st->frag_data)
1567 st->stepped_offset += skb_headlen(st->cur_skb);
1568
1569 while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
1570 frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
1571 block_limit = frag->size + st->stepped_offset;
1572
1573 if (abs_offset < block_limit) {
1574 if (!st->frag_data)
1575 st->frag_data = kmap_skb_frag(frag);
1576
1577 *data = (u8 *) st->frag_data + frag->page_offset +
1578 (abs_offset - st->stepped_offset);
1579
1580 return block_limit - abs_offset;
1581 }
1582
1583 if (st->frag_data) {
1584 kunmap_skb_frag(st->frag_data);
1585 st->frag_data = NULL;
1586 }
1587
1588 st->frag_idx++;
1589 st->stepped_offset += frag->size;
1590 }
1591
1592 if (st->cur_skb->next) {
1593 st->cur_skb = st->cur_skb->next;
1594 st->frag_idx = 0;
1595 goto next_skb;
1596 } else if (st->root_skb == st->cur_skb &&
1597 skb_shinfo(st->root_skb)->frag_list) {
1598 st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
1599 goto next_skb;
1600 }
1601
1602 return 0;
1603}
1604
1605/**
1606 * skb_abort_seq_read - Abort a sequential read of skb data
1607 * @st: state variable
1608 *
1609 * Must be called if skb_seq_read() was not called until it
1610 * returned 0.
1611 */
1612void skb_abort_seq_read(struct skb_seq_state *st)
1613{
1614 if (st->frag_data)
1615 kunmap_skb_frag(st->frag_data);
1616}
1617
1618#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
1619
1620static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
1621 struct ts_config *conf,
1622 struct ts_state *state)
1623{
1624 return skb_seq_read(offset, text, TS_SKB_CB(state));
1625}
1626
1627static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
1628{
1629 skb_abort_seq_read(TS_SKB_CB(state));
1630}
1631
1632/**
1633 * skb_find_text - Find a text pattern in skb data
1634 * @skb: the buffer to look in
1635 * @from: search offset
1636 * @to: search limit
1637 * @config: textsearch configuration
1638 * @state: uninitialized textsearch state variable
1639 *
1640 * Finds a pattern in the skb data according to the specified
1641 * textsearch configuration. Use textsearch_next() to retrieve
1642 * subsequent occurrences of the pattern. Returns the offset
1643 * to the first occurrence or UINT_MAX if no match was found.
1644 */
1645unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
1646 unsigned int to, struct ts_config *config,
1647 struct ts_state *state)
1648{
1649 config->get_next_block = skb_ts_get_next_block;
1650 config->finish = skb_ts_finish;
1651
1652 skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));
1653
1654 return textsearch_find(config, state);
1655}
1656
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1538,3 +1692,7 @@ EXPORT_SYMBOL(skb_queue_tail);
 EXPORT_SYMBOL(skb_unlink);
 EXPORT_SYMBOL(skb_append);
 EXPORT_SYMBOL(skb_split);
+EXPORT_SYMBOL(skb_prepare_seq_read);
+EXPORT_SYMBOL(skb_seq_read);
+EXPORT_SYMBOL(skb_abort_seq_read);
+EXPORT_SYMBOL(skb_find_text);
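The exports above round out the new zerocopy reader. A typical consumer, following the kerneldoc in this hunk, looks roughly like this (the function and its byte-summing logic are hypothetical):

	static unsigned int sum_skb_bytes(struct sk_buff *skb,
					  unsigned int from, unsigned int to)
	{
		struct skb_seq_state st;
		const u8 *data;
		unsigned int len, consumed = 0, sum = 0, i;

		skb_prepare_seq_read(skb, from, to, &st);
		while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
			for (i = 0; i < len; i++)
				sum += data[i];
			consumed += len;	/* consuming less is allowed too */
		}
		/* the loop ended because skb_seq_read() returned 0, so no
		 * skb_abort_seq_read() is required here */
		return sum;
	}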
diff --git a/net/core/sock.c b/net/core/sock.c
index a6ec3ada7f9e..8b35ccdc2b3b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -622,7 +622,8 @@ lenout:
  *	@prot: struct proto associated with this new sock instance
  *	@zero_it: if we should zero the newly allocated sock
  */
-struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
+struct sock *sk_alloc(int family, unsigned int __nocast priority,
+		      struct proto *prot, int zero_it)
 {
 	struct sock *sk = NULL;
 	kmem_cache_t *slab = prot->slab;
@@ -750,7 +751,8 @@ unsigned long sock_i_ino(struct sock *sk)
 /*
  * Allocate a skb from the socket's send buffer.
  */
-struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+			     unsigned int __nocast priority)
 {
 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 		struct sk_buff * skb = alloc_skb(size, priority);
@@ -765,7 +767,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
 /*
  * Allocate a skb from the socket's receive buffer.
  */
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+			     unsigned int __nocast priority)
 {
 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 		struct sk_buff *skb = alloc_skb(size, priority);
@@ -780,7 +783,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
 /*
  * Allocate a memory block from the socket's option memory buffer.
  */
-void *sock_kmalloc(struct sock *sk, int size, int priority)
+void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
 {
 	if ((unsigned)size <= sysctl_optmem_max &&
 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
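All of these signature changes swap a bare int for unsigned int __nocast on allocation-priority arguments. __nocast is a sparse annotation: ordinary builds compile it away, while sparse refuses silent conversions to and from the annotated type, catching callers that pass an arbitrary integer where a GFP mask belongs. Roughly how it is wired up (a sketch of the linux/compiler.h arrangement of this era, not copied from this tree):

	#ifdef __CHECKER__
	# define __nocast	__attribute__((nocast))	/* sparse-only attribute */
	#else
	# define __nocast				/* no effect on gcc */
	#endif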
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 880a88815211..8f817ad9f546 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,12 +13,8 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
+extern int netdev_budget;
 extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
-extern int netdev_fastroute;
 extern int net_msg_cost;
 extern int net_msg_burst;
 
@@ -86,38 +82,6 @@ ctl_table core_table[] = {
 		.proc_handler	= &proc_dointvec
 	},
 	{
-		.ctl_name	= NET_CORE_NO_CONG_THRESH,
-		.procname	= "no_cong_thresh",
-		.data		= &no_cong_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG,
-		.procname	= "no_cong",
-		.data		= &no_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_LO_CONG,
-		.procname	= "lo_cong",
-		.data		= &lo_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_MOD_CONG,
-		.procname	= "mod_cong",
-		.data		= &mod_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
 		.ctl_name	= NET_CORE_MSG_COST,
 		.procname	= "message_cost",
 		.data		= &net_msg_cost,
@@ -161,6 +125,14 @@ ctl_table core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_CORE_BUDGET,
+		.procname	= "netdev_budget",
+		.data		= &netdev_budget,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/core/wireless.c b/net/core/wireless.c
index b2fe378dfbf8..3ff5639c0b78 100644
--- a/net/core/wireless.c
+++ b/net/core/wireless.c
@@ -1102,6 +1102,7 @@ static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb,
 	nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r));
 	r = NLMSG_DATA(nlh);
 	r->ifi_family = AF_UNSPEC;
+	r->__ifi_pad = 0;
 	r->ifi_type = dev->type;
 	r->ifi_index = dev->ifindex;
 	r->ifi_flags = dev->flags;
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index 2101da542ba8..92f2ec46fd22 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -1,6 +1,29 @@
 #
 # DECnet configuration
 #
+config DECNET
+	tristate "DECnet Support"
+	---help---
+	  The DECnet networking protocol was used in many products made by
+	  Digital (now Compaq).  It provides reliable stream and sequenced
+	  packet communications over which run a variety of services similar
+	  to those which run over TCP/IP.
+
+	  To find some tools to use with the kernel layer support, please
+	  look at Patrick Caulfield's web site:
+	  <http://linux-decnet.sourceforge.net/>.
+
+	  More detailed documentation is available in
+	  <file:Documentation/networking/decnet.txt>.
+
+	  Be sure to say Y to "/proc file system support" and "Sysctl support"
+	  below when using DECnet, since you will need sysctl support to aid
+	  in configuration at run time.
+
+	  The DECnet code is also available as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want).
+	  The module is called decnet.
+
 config DECNET_ROUTER
 	bool "DECnet: router support (EXPERIMENTAL)"
 	depends on DECNET && EXPERIMENTAL
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 29bb3cd21965..96a02800cd28 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -536,7 +536,7 @@ static void dn_keepalive(struct sock *sk)
536 * we are double checking that we are not sending too 536 * we are double checking that we are not sending too
537 * many of these keepalive frames. 537 * many of these keepalive frames.
538 */ 538 */
539 if (skb_queue_len(&scp->other_xmit_queue) == 0) 539 if (skb_queue_empty(&scp->other_xmit_queue))
540 dn_nsp_send_link(sk, DN_NOCHANGE, 0); 540 dn_nsp_send_link(sk, DN_NOCHANGE, 0);
541} 541}
542 542
@@ -1191,7 +1191,7 @@ static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table
1191 struct dn_scp *scp = DN_SK(sk); 1191 struct dn_scp *scp = DN_SK(sk);
1192 int mask = datagram_poll(file, sock, wait); 1192 int mask = datagram_poll(file, sock, wait);
1193 1193
1194 if (skb_queue_len(&scp->other_receive_queue)) 1194 if (!skb_queue_empty(&scp->other_receive_queue))
1195 mask |= POLLRDBAND; 1195 mask |= POLLRDBAND;
1196 1196
1197 return mask; 1197 return mask;
@@ -1214,7 +1214,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1214 1214
1215 case SIOCATMARK: 1215 case SIOCATMARK:
1216 lock_sock(sk); 1216 lock_sock(sk);
1217 val = (skb_queue_len(&scp->other_receive_queue) != 0); 1217 val = !skb_queue_empty(&scp->other_receive_queue);
1218 if (scp->state != DN_RUN) 1218 if (scp->state != DN_RUN)
1219 val = -ENOTCONN; 1219 val = -ENOTCONN;
1220 release_sock(sk); 1220 release_sock(sk);
@@ -1630,7 +1630,7 @@ static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int
1630 int len = 0; 1630 int len = 0;
1631 1631
1632 if (flags & MSG_OOB) 1632 if (flags & MSG_OOB)
1633 return skb_queue_len(q) ? 1 : 0; 1633 return !skb_queue_empty(q) ? 1 : 0;
1634 1634
1635 while(skb != (struct sk_buff *)q) { 1635 while(skb != (struct sk_buff *)q) {
1636 struct dn_skb_cb *cb = DN_SKB_CB(skb); 1636 struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -1707,7 +1707,7 @@ static int dn_recvmsg(struct kiocb *iocb, struct socket *sock,
1707 if (sk->sk_err) 1707 if (sk->sk_err)
1708 goto out; 1708 goto out;
1709 1709
1710 if (skb_queue_len(&scp->other_receive_queue)) { 1710 if (!skb_queue_empty(&scp->other_receive_queue)) {
1711 if (!(flags & MSG_OOB)) { 1711 if (!(flags & MSG_OOB)) {
1712 msg->msg_flags |= MSG_OOB; 1712 msg->msg_flags |= MSG_OOB;
1713 if (!scp->other_report) { 1713 if (!scp->other_report) {
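Every conversion in this file replaces a skb_queue_len() comparison with skb_queue_empty(). The two are equivalent for emptiness tests, but the latter states the intent and only inspects the list head pointer rather than the qlen counter. Its 2.6-era definition in linux/skbuff.h is essentially:

	static inline int skb_queue_empty(const struct sk_buff_head *list)
	{
		return list->next == (struct sk_buff *)list;
	}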
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 9934b25720e4..99bc061759c3 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -551,7 +551,8 @@ int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
 		if (t < s_t)
 			continue;
 		if (t > s_t)
-			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int));
+			memset(&cb->args[1], 0,
+			       sizeof(cb->args) - sizeof(cb->args[0]));
 		tb = dn_fib_get_table(t, 0);
 		if (tb == NULL)
 			continue;
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 42abbf3f524f..8cce1fdbda90 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -342,7 +342,8 @@ int dn_nsp_xmit_timeout(struct sock *sk)
 
 	dn_nsp_output(sk);
 
-	if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue))
+	if (!skb_queue_empty(&scp->data_xmit_queue) ||
+	    !skb_queue_empty(&scp->other_xmit_queue))
 		scp->persist = dn_nsp_persist(sk);
 
 	return 0;
diff --git a/net/econet/Kconfig b/net/econet/Kconfig
new file mode 100644
index 000000000000..39a2d2975e0e
--- /dev/null
+++ b/net/econet/Kconfig
@@ -0,0 +1,36 @@
1#
2# Acorn Econet/AUN protocols
3#
4
5config ECONET
6 tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)"
7 depends on EXPERIMENTAL && INET
8 ---help---
9 Econet is a fairly old and slow networking protocol mainly used by
10 Acorn computers to access file and print servers. It uses native
11 Econet network cards. AUN is an implementation of the higher level
12 parts of Econet that runs over ordinary Ethernet connections, on
13 top of the UDP packet protocol, which in turn runs on top of the
14 Internet protocol IP.
15
16 If you say Y here, you can choose with the next two options whether
17 to send Econet/AUN traffic over a UDP Ethernet connection or over
18 a native Econet network card.
19
20 To compile this driver as a module, choose M here: the module
21 will be called econet.
22
23config ECONET_AUNUDP
24 bool "AUN over UDP"
25 depends on ECONET
26 help
27 Say Y here if you want to send Econet/AUN traffic over a UDP
28 connection (UDP is a packet based protocol that runs on top of the
29 Internet protocol IP) using an ordinary Ethernet network card.
30
31config ECONET_NATIVE
32 bool "Native Econet"
33 depends on ECONET
34 help
35 Say Y here if you have a native Econet network card installed in
36 your computer.
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 6617ea47d365..f6dbfb99b14d 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -92,10 +92,9 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
92 * Set the source hardware address. 92 * Set the source hardware address.
93 */ 93 */
94 94
95 if(saddr) 95 if(!saddr)
96 memcpy(eth->h_source,saddr,dev->addr_len); 96 saddr = dev->dev_addr;
97 else 97 memcpy(eth->h_source,saddr,dev->addr_len);
98 memcpy(eth->h_source,dev->dev_addr,dev->addr_len);
99 98
100 /* 99 /*
101 * Anyway, the loopback-device should never use this function... 100 * Anyway, the loopback-device should never use this function...
@@ -156,7 +155,7 @@ int eth_rebuild_header(struct sk_buff *skb)
156 * This is normal practice and works for any 'now in use' protocol. 155 * This is normal practice and works for any 'now in use' protocol.
157 */ 156 */
158 157
159unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) 158__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
160{ 159{
161 struct ethhdr *eth; 160 struct ethhdr *eth;
162 unsigned char *rawp; 161 unsigned char *rawp;
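Retyping eth_type_trans() from unsigned short to __be16 documents that the value is big-endian wire format. __be16 is bitwise-identical to u16 but sparse-checked, so a caller mixing it with host-order constants gets flagged; comparisons should go through htons(). A hypothetical caller:

	__be16 proto = eth_type_trans(skb, dev);

	if (proto == htons(ETH_P_IP))	/* compare in network byte order */
		handle_ip(skb);		/* hypothetical consumer */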
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 567b03b1c349..df5386885a90 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,35 +1,8 @@
 #
 # IP configuration
 #
-choice
-	prompt "Choose IP: FIB lookup"
-	depends on INET
-	default IP_FIB_HASH
-
-config IP_FIB_HASH
-	bool "FIB_HASH"
-	---help---
-	  Current FIB is very proven and good enough for most users.
-
-config IP_FIB_TRIE
-	bool "FIB_TRIE"
-	---help---
-	  Use new experimental LC-trie as FIB lookup algoritm.
-	  This improves lookup performance
-
-	  LC-trie is described in:
-
-	  IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
-	  IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
-	  An experimental study of compression methods for dynamic tries
-	  Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
-	  http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
-
-endchoice
-
 config IP_MULTICAST
 	bool "IP: multicasting"
-	depends on INET
 	help
 	  This is code for addressing several networked computers at once,
 	  enlarging your kernel by about 2 KB. You need multicasting if you
@@ -43,7 +16,6 @@ config IP_MULTICAST
 
 config IP_ADVANCED_ROUTER
 	bool "IP: advanced router"
-	depends on INET
 	---help---
 	  If you intend to run your Linux box mostly as a router, i.e. as a
 	  computer that forwards and redistributes network packets, say Y; you
@@ -79,6 +51,44 @@ config IP_ADVANCED_ROUTER
 
 	  If unsure, say N here.
 
81 53
54choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER
57 default IP_FIB_HASH
58
59config IP_FIB_HASH
60 bool "FIB_HASH"
61 ---help---
62 Current FIB is very proven and good enough for most users.
63
64config IP_FIB_TRIE
65 bool "FIB_TRIE"
66 ---help---
67	  Use new experimental LC-trie as FIB lookup algorithm.
68 This improves lookup performance if you have a large
69 number of routes.
70
71 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex.
74
75 LC-trie is described in:
76
77 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
79 An experimental study of compression methods for dynamic tries
80 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
82
83endchoice
84
85# If the user does not enable advanced routing, he gets the safe
86# default of the fib-hash algorithm.
87config IP_FIB_HASH
88 bool
89 depends on !IP_ADVANCED_ROUTER
90 default y
91
 config IP_MULTIPLE_TABLES
 	bool "IP: policy routing"
 	depends on IP_ADVANCED_ROUTER
@@ -171,7 +181,6 @@ config IP_ROUTE_VERBOSE
 
 config IP_PNP
 	bool "IP: kernel level autoconfiguration"
-	depends on INET
 	help
 	  This enables automatic configuration of IP addresses of devices and
 	  of the routing table during kernel boot, based on either information
@@ -230,7 +239,6 @@ config IP_PNP_RARP
 #   bool '    IP: ARP support' CONFIG_IP_PNP_ARP
 config NET_IPIP
 	tristate "IP: tunneling"
-	depends on INET
 	select INET_TUNNEL
 	---help---
 	  Tunneling means encapsulating data of one protocol type within
@@ -248,7 +256,6 @@ config NET_IPIP
 
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
-	depends on INET
 	select XFRM
 	help
 	  Tunneling means encapsulating data of one protocol type within
@@ -307,7 +314,7 @@ config IP_PIMSM_V2
 
 config ARPD
 	bool "IP: ARP daemon support (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	---help---
 	  Normally, the kernel maintains an internal cache which maps IP
 	  addresses to hardware addresses on the local network, so that
@@ -332,7 +339,6 @@ config ARPD
 
 config SYN_COOKIES
 	bool "IP: TCP syncookie support (disabled per default)"
-	depends on INET
 	---help---
 	  Normal TCP/IP networking is open to an attack known as "SYN
 	  flooding". This denial-of-service attack prevents legitimate remote
@@ -369,7 +375,6 @@ config SYN_COOKIES
 
 config INET_AH
 	tristate "IP: AH transformation"
-	depends on INET
 	select XFRM
 	select CRYPTO
 	select CRYPTO_HMAC
@@ -382,7 +387,6 @@ config INET_AH
 
 config INET_ESP
 	tristate "IP: ESP transformation"
-	depends on INET
 	select XFRM
 	select CRYPTO
 	select CRYPTO_HMAC
@@ -396,7 +400,6 @@ config INET_ESP
 
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
-	depends on INET
 	select XFRM
 	select INET_TUNNEL
 	select CRYPTO
@@ -409,7 +412,6 @@ config INET_IPCOMP
 
 config INET_TUNNEL
 	tristate "IP: tunnel transformation"
-	depends on INET
 	select XFRM
 	---help---
 	  Support for generic IP tunnel transformation, which is required by
@@ -419,7 +421,6 @@ config INET_TUNNEL
 
 config IP_TCPDIAG
 	tristate "IP: TCP socket monitoring interface"
-	depends on INET
 	default y
 	---help---
 	  Support for TCP socket monitoring interface used by native Linux
@@ -433,5 +434,108 @@ config IP_TCPDIAG
 config IP_TCPDIAG_IPV6
 	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
 
437config TCP_CONG_ADVANCED
438 bool "TCP: advanced congestion control"
439 ---help---
440 Support for selection of various TCP congestion control
441 modules.
442
443 Nearly all users can safely say no here, and a safe default
444 selection will be made (BIC-TCP with new Reno as a fallback).
445
446 If unsure, say N.
447
448# TCP Reno is builtin (required as fallback)
449menu "TCP congestion control"
450 depends on TCP_CONG_ADVANCED
451
452config TCP_CONG_BIC
453 tristate "Binary Increase Congestion (BIC) control"
454 default y
455 ---help---
456 BIC-TCP is a sender-side only change that ensures a linear RTT
457 fairness under large windows while offering both scalability and
458 bounded TCP-friendliness. The protocol combines two schemes
459 called additive increase and binary search increase. When the
460 congestion window is large, additive increase with a large
461 increment ensures linear RTT fairness as well as good
462 scalability. Under small congestion windows, binary search
463 increase provides TCP friendliness.
464 See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
465
466config TCP_CONG_WESTWOOD
467 tristate "TCP Westwood+"
468 default m
469 ---help---
470 TCP Westwood+ is a sender-side only modification of the TCP Reno
471 protocol stack that optimizes the performance of TCP congestion
472 control. It is based on end-to-end bandwidth estimation to set
473 congestion window and slow start threshold after a congestion
474 episode. Using this estimation, TCP Westwood+ adaptively sets a
475 slow start threshold and a congestion window which takes into
476 account the bandwidth used at the time congestion is experienced.
477 TCP Westwood+ significantly increases fairness wrt TCP Reno in
478 wired networks and throughput over wireless links.
479
480config TCP_CONG_HTCP
481 tristate "H-TCP"
482 default m
483 ---help---
484	  H-TCP is a sender-side only modification of the TCP Reno
485 protocol stack that optimizes the performance of TCP
486 congestion control for high speed network links. It uses a
487	  mode switch to change the alpha and beta parameters of TCP Reno
488	  based on network conditions, in a way that is fair with
489 other Reno and H-TCP flows.
490
491config TCP_CONG_HSTCP
492 tristate "High Speed TCP"
493 depends on EXPERIMENTAL
494 default n
495 ---help---
496 Sally Floyd's High Speed TCP (RFC 3649) congestion control.
497 A modification to TCP's congestion control mechanism for use
498 with large congestion windows. A table indicates how much to
499 increase the congestion window by when an ACK is received.
500 For more detail see http://www.icir.org/floyd/hstcp.html
501
502config TCP_CONG_HYBLA
503 tristate "TCP-Hybla congestion control algorithm"
504 depends on EXPERIMENTAL
505 default n
506 ---help---
507 TCP-Hybla is a sender-side only change that eliminates penalization of
508 long-RTT, large-bandwidth connections, like when satellite legs are
509	  involved, especially when sharing a common bottleneck with normal
510 terrestrial connections.
511
512config TCP_CONG_VEGAS
513 tristate "TCP Vegas"
514 depends on EXPERIMENTAL
515 default n
516 ---help---
517 TCP Vegas is a sender-side only change to TCP that anticipates
518 the onset of congestion by estimating the bandwidth. TCP Vegas
519 adjusts the sending rate by modifying the congestion
520 window. TCP Vegas should provide less packet loss, but it is
521 not as aggressive as TCP Reno.
522
523config TCP_CONG_SCALABLE
524 tristate "Scalable TCP"
525 depends on EXPERIMENTAL
526 default n
527 ---help---
528 Scalable TCP is a sender-side only change to TCP which uses a
529	  MIMD congestion control algorithm that has some nice scaling
530	  properties, though it is known to have fairness issues.
531 See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
532
533endmenu
534
535config TCP_CONG_BIC
536 tristate
537 depends on !TCP_CONG_ADVANCED
538 default y
539
436source "net/ipv4/ipvs/Kconfig" 540source "net/ipv4/ipvs/Kconfig"
437 541
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1add..5718cdb3a61e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,8 @@
 obj-y     := utils.o route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o \
-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
 
@@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/
 obj-$(CONFIG_IP_VS) += ipvs/
 obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o
 obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
+obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
+obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
+obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
+obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
+obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
+obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 	xfrm4_output.o
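Each of the new objects above is a self-registering module built on the tcp_cong.c framework added by this merge. A minimal module has roughly this shape (the ops fields and register/unregister calls follow the 2.6.13-era API; "mycc" and its choice of the exported Reno helpers are placeholders, not one of the real modules):

	#include <linux/module.h>
	#include <net/tcp.h>

	/* Placeholder algorithm: defer to the Reno building blocks that
	 * tcp_cong.c exports for exactly this purpose. */
	static struct tcp_congestion_ops tcp_mycc = {
		.name		= "mycc",
		.owner		= THIS_MODULE,
		.ssthresh	= tcp_reno_ssthresh,
		.cong_avoid	= tcp_reno_cong_avoid,
		.min_cwnd	= tcp_reno_min_cwnd,
	};

	static int __init tcp_mycc_register(void)
	{
		return tcp_register_congestion_control(&tcp_mycc);
	}

	static void __exit tcp_mycc_unregister(void)
	{
		tcp_unregister_congestion_control(&tcp_mycc);
	}

	module_init(tcp_mycc_register);
	module_exit(tcp_mycc_unregister);
	MODULE_LICENSE("GPL");
	MODULE_DESCRIPTION("Example TCP congestion control");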
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 658e7977924d..ef7468376ae6 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1009,6 +1009,15 @@ static int __init init_ipv4_mibs(void)
 static int ipv4_proc_init(void);
 extern void ipfrag_init(void);
 
+/*
+ *	IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+	.type = __constant_htons(ETH_P_IP),
+	.func = ip_rcv,
+};
+
 static int __init inet_init(void)
 {
 	struct sk_buff *dummy_skb;
@@ -1102,6 +1111,8 @@ static int __init inet_init(void)
 
 	ipfrag_init();
 
+	dev_add_pack(&ip_packet_type);
+
 	rc = 0;
 out:
 	return rc;
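Registering ip_packet_type is deliberately the last step of inet_init(): dev_add_pack() makes the handler live immediately, so deferring it guarantees no IP frame is delivered before routing, fragment handling and the protocol array are ready. The same pattern applies to any protocol tap (a sketch; my_rcv is hypothetical, with the three-argument handler prototype this tree uses):

	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
			  struct packet_type *pt);	/* hypothetical handler */

	static struct packet_type my_packet_type = {
		.type = __constant_htons(ETH_P_ALL),	/* tap all protocols */
		.func = my_rcv,
	};

	/* ... initialise everything my_rcv depends on, then: */
	dev_add_pack(&my_packet_type);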
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 0671569ee6f0..4be234c7d8c3 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -43,7 +43,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#define VERSION "0.323"
+#define VERSION "0.325"
 
 #include <linux/config.h>
 #include <asm/uaccess.h>
@@ -136,6 +136,7 @@ struct trie_use_stats {
 	unsigned int semantic_match_passed;
 	unsigned int semantic_match_miss;
 	unsigned int null_node_hit;
+	unsigned int resize_node_skipped;
 };
 #endif
 
@@ -164,8 +165,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
 static int tnode_child_length(struct tnode *tn);
 static struct node *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
 static void tnode_free(struct tnode *tn);
 static void trie_dump_seq(struct seq_file *seq, struct trie *t);
 extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
@@ -341,8 +342,10 @@ static struct leaf *leaf_new(void)
 static struct leaf_info *leaf_info_new(int plen)
 {
 	struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
-	li->plen = plen;
-	INIT_LIST_HEAD(&li->falh);
+	if(li) {
+		li->plen = plen;
+		INIT_LIST_HEAD(&li->falh);
+	}
 	return li;
 }
 
@@ -356,11 +359,32 @@ static inline void free_leaf_info(struct leaf_info *li)
 	kfree(li);
 }
 
+static struct tnode *tnode_alloc(unsigned int size)
+{
+	if (size <= PAGE_SIZE) {
+		return kmalloc(size, GFP_KERNEL);
+	} else {
+		return (struct tnode *)
+		       __get_free_pages(GFP_KERNEL, get_order(size));
+	}
+}
+
+static void __tnode_free(struct tnode *tn)
+{
+	unsigned int size = sizeof(struct tnode) +
+			    (1<<tn->bits) * sizeof(struct node *);
+
+	if (size <= PAGE_SIZE)
+		kfree(tn);
+	else
+		free_pages((unsigned long)tn, get_order(size));
+}
+
 static struct tnode* tnode_new(t_key key, int pos, int bits)
 {
 	int nchildren = 1<<bits;
 	int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
-	struct tnode *tn = kmalloc(sz, GFP_KERNEL);
+	struct tnode *tn = tnode_alloc(sz);
 
 	if(tn)  {
 		memset(tn, 0, sz);
@@ -388,7 +412,7 @@ static void tnode_free(struct tnode *tn)
 		printk("FL %p \n", tn);
 	}
 	else if(IS_TNODE(tn)) {
-		kfree(tn);
+		__tnode_free(tn);
 		if(trie_debug > 0 )
 			printk("FT %p \n", tn);
 	}
@@ -458,6 +482,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w
 static struct node *resize(struct trie *t, struct tnode *tn)
 {
 	int i;
+	int err = 0;
 
 	if (!tn)
 		return NULL;
@@ -554,12 +579,20 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 */
 
 	check_tnode(tn);
 
+	err = 0;
 	while ((tn->full_children > 0 &&
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
 				inflate_threshold * tnode_child_length(tn))) {
 
-		tn = inflate(t, tn);
+		tn = inflate(t, tn, &err);
+
+		if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
 	}
 
 	check_tnode(tn);
@@ -568,11 +601,22 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 * Halve as long as the number of empty children in this
 	 * node is above threshold.
 	 */
+
+	err = 0;
 	while (tn->bits > 1 &&
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold * tnode_child_length(tn))
+	       halve_threshold * tnode_child_length(tn)) {
+
+		tn = halve(t, tn, &err);
+
+		if(err) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+			t->stats.resize_node_skipped++;
+#endif
+			break;
+		}
+	}
 
-		tn = halve(t, tn);
 
 	/* Only one child remains */
 
@@ -597,7 +641,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	return (struct node *) tn;
 }
 
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
 {
 	struct tnode *inode;
 	struct tnode *oldtnode = tn;
@@ -609,8 +653,63 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 	tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 
-	if (!tn)
-		trie_bug("tnode_new failed");
+	if (!tn) {
+		*err = -ENOMEM;
+		return oldtnode;
+	}
659 }
660
661 /*
662 * Preallocate and store tnodes before the actual work so we
663 * don't get into an inconsistent state if memory allocation
664	 * fails. In case of failure we return the old node and the
665	 * inflate of the tnode is ignored.
666 */
667
668 for(i = 0; i < olen; i++) {
669 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
670
671 if (inode &&
672 IS_TNODE(inode) &&
673 inode->pos == oldtnode->pos + oldtnode->bits &&
674 inode->bits > 1) {
675 struct tnode *left, *right;
676
677 t_key m = TKEY_GET_MASK(inode->pos, 1);
678
679 left = tnode_new(inode->key&(~m), inode->pos + 1,
680 inode->bits - 1);
681
682 if(!left) {
683 *err = -ENOMEM;
684 break;
685 }
686
687 right = tnode_new(inode->key|m, inode->pos + 1,
688 inode->bits - 1);
689
690 if(!right) {
691 *err = -ENOMEM;
692 break;
693 }
694
695 put_child(t, tn, 2*i, (struct node *) left);
696 put_child(t, tn, 2*i+1, (struct node *) right);
697 }
698 }
699
700 if(*err) {
701 int size = tnode_child_length(tn);
702 int j;
703
704 for(j = 0; j < size; j++)
705 if( tn->child[j])
706 tnode_free((struct tnode *)tn->child[j]);
707
708 tnode_free(tn);
709
710 *err = -ENOMEM;
711 return oldtnode;
712 }
 
 	for(i = 0; i < olen; i++) {
 		struct node *node = tnode_get_child(oldtnode, i);
@@ -623,7 +722,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 
 		if(IS_LEAF(node) || ((struct tnode *) node)->pos >
 		   tn->pos + tn->bits - 1) {
-			if(tkey_extract_bits(node->key, tn->pos + tn->bits - 1,
+			if(tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
 			   1) == 0)
 				put_child(t, tn, 2*i, node);
 			else
@@ -663,27 +762,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 			 * the position (inode->pos)
 			 */
 
-			t_key m = TKEY_GET_MASK(inode->pos, 1);
-
 			/* Use the old key, but set the new significant
 			 * bit to zero.
 			 */
-			left = tnode_new(inode->key&(~m), inode->pos + 1,
-					 inode->bits - 1);
 
-			if(!left)
-				trie_bug("tnode_new failed");
+			left = (struct tnode *) tnode_get_child(tn, 2*i);
+			put_child(t, tn, 2*i, NULL);
 
+			if(!left)
+				BUG();
 
-			/* Use the old key, but set the new significant
-			 * bit to one.
-			 */
-			right = tnode_new(inode->key|m, inode->pos + 1,
-					  inode->bits - 1);
+			right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+			put_child(t, tn, 2*i+1, NULL);
 
-			if(!right)
-				trie_bug("tnode_new failed");
+			if(!right)
+				BUG();
 
 			size = tnode_child_length(left);
 			for(j = 0; j < size; j++) {
 				put_child(t, left, j, inode->child[j]);
@@ -699,7 +793,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
 	return tn;
 }
 
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
 {
 	struct tnode *oldtnode = tn;
 	struct node *left, *right;
@@ -710,8 +804,48 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 
 	tn=tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 
-	if(!tn)
-		trie_bug("tnode_new failed");
+	if (!tn) {
+		*err = -ENOMEM;
+		return oldtnode;
+	}
811
812 /*
813 * Preallocate and store tnodes before the actual work so we
814 * don't get into an inconsistent state if memory allocation
815	 * fails. In case of failure we return the old node and the
816	 * halve of the tnode is ignored.
817 */
818
819 for(i = 0; i < olen; i += 2) {
820 left = tnode_get_child(oldtnode, i);
821 right = tnode_get_child(oldtnode, i+1);
822
823 /* Two nonempty children */
824 if( left && right) {
825 struct tnode *newBinNode =
826 tnode_new(left->key, tn->pos + tn->bits, 1);
827
828 if(!newBinNode) {
829 *err = -ENOMEM;
830 break;
831 }
832 put_child(t, tn, i/2, (struct node *)newBinNode);
833 }
834 }
835
836 if(*err) {
837 int size = tnode_child_length(tn);
838 int j;
839
840 for(j = 0; j < size; j++)
841 if( tn->child[j])
842 tnode_free((struct tnode *)tn->child[j]);
843
844 tnode_free(tn);
845
846 *err = -ENOMEM;
847 return oldtnode;
848 }
 
 	for(i = 0; i < olen; i += 2) {
 		left = tnode_get_child(oldtnode, i);
@@ -728,10 +862,11 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
 		/* Two nonempty children */
 		else {
 			struct tnode *newBinNode =
-				tnode_new(left->key, tn->pos + tn->bits, 1);
+				(struct tnode *) tnode_get_child(tn, i/2);
+			put_child(t, tn, i/2, NULL);
 
 			if(!newBinNode)
-				trie_bug("tnode_new failed");
+				BUG();
 
 			put_child(t, newBinNode, 0, left);
 			put_child(t, newBinNode, 1, right);
@@ -879,8 +1014,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
 	return (struct node*) tn;
 }
 
 static struct list_head *
-fib_insert_node(struct trie *t, u32 key, int plen)
+fib_insert_node(struct trie *t, int *err, u32 key, int plen)
 {
 	int pos, newpos;
 	struct tnode *tp = NULL, *tn = NULL;
@@ -940,7 +1075,6 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 	if(tp && IS_LEAF(tp))
 		BUG();
 
-	t->revision++;
 
945 /* Case 1: n is a leaf. Compare prefixes */ 1079 /* Case 1: n is a leaf. Compare prefixes */
946 1080
@@ -949,8 +1083,10 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 
 		li = leaf_info_new(plen);
 
-		if(! li)
-			BUG();
+		if(! li) {
+			*err = -ENOMEM;
+			goto err;
+		}
 
 		fa_head = &li->falh;
 		insert_leaf_info(&l->list, li);
@@ -959,14 +1095,19 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 		t->size++;
 		l = leaf_new();
 
-		if(! l)
-			BUG();
+		if(! l) {
+			*err = -ENOMEM;
+			goto err;
+		}
 
 		l->key = key;
 		li = leaf_info_new(plen);
 
-		if(! li)
-			BUG();
+		if(! li) {
+			tnode_free((struct tnode *) l);
+			*err = -ENOMEM;
+			goto err;
+		}
 
 		fa_head = &li->falh;
 		insert_leaf_info(&l->list, li);
@@ -1003,9 +1144,14 @@ fib_insert_node(struct trie *t, u32 key, int plen)
 			newpos = 0;
 			tn = tnode_new(key, newpos, 1); /* First tnode */
 		}
-		if(!tn)
-			trie_bug("tnode_pfx_new failed");
 
+		if(!tn) {
+			free_leaf_info(li);
+			tnode_free((struct tnode *) l);
+			*err = -ENOMEM;
+			goto err;
+		}
+
 		NODE_SET_PARENT(tn, tp);
 
 		missbit=tkey_extract_bits(key, newpos, 1);
@@ -1027,7 +1173,9 @@ fib_insert_node(struct trie *t, u32 key, int plen)
1027 } 1173 }
1028 /* Rebalance the trie */ 1174 /* Rebalance the trie */
1029 t->trie = trie_rebalance(t, tp); 1175 t->trie = trie_rebalance(t, tp);
1030done:; 1176done:
1177 t->revision++;
1178err:;
1031 return fa_head; 1179 return fa_head;
1032} 1180}
1033 1181
@@ -1156,8 +1304,12 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1156 * Insert new entry to the list. 1304 * Insert new entry to the list.
1157 */ 1305 */
1158 1306
1159 if(!fa_head) 1307 if(!fa_head) {
1160 fa_head = fib_insert_node(t, key, plen); 1308 fa_head = fib_insert_node(t, &err, key, plen);
1309 err = 0;
1310 if(err)
1311 goto out_free_new_fa;
1312 }
1161 1313
1162 write_lock_bh(&fib_lock); 1314 write_lock_bh(&fib_lock);
1163 1315
@@ -1170,6 +1322,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1170 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); 1322 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1171succeeded: 1323succeeded:
1172 return 0; 1324 return 0;
1325
1326out_free_new_fa:
1327 kmem_cache_free(fn_alias_kmem, new_fa);
1173out: 1328out:
1174 fib_release_info(fi); 1329 fib_release_info(fi);
1175err:; 1330err:;
@@ -2279,6 +2434,7 @@ static void collect_and_show(struct trie *t, struct seq_file *seq)
2279 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed); 2434 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2280 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss); 2435 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2281 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit); 2436 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2437 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2282#ifdef CLEAR_STATS 2438#ifdef CLEAR_STATS
2283 memset(&(t->stats), 0, sizeof(t->stats)); 2439 memset(&(t->stats), 0, sizeof(t->stats));
2284#endif 2440#endif
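
The fib_trie.c hunks above convert hard BUG() calls on allocation failure into a recoverable path: fib_insert_node() gains an int *err out-parameter, frees whatever leaf and leaf_info it had built before failing, and inflate()/halve() unwind their partially filled child arrays before handing back the old node. A minimal userspace sketch of that out-parameter pattern, assuming illustrative names (node_new/build_pair are not kernel APIs):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { struct node *left, *right; };

    static struct node *node_new(void)
    {
        return calloc(1, sizeof(struct node));
    }

    static struct node *build_pair(int *err)
    {
        struct node *n = node_new();

        *err = 0;
        if (!n)
            goto nomem;
        n->left = node_new();
        n->right = node_new();
        if (!n->left || !n->right) {
            /* unwind the partially built structure before reporting */
            free(n->left);
            free(n->right);
            free(n);
            goto nomem;
        }
        return n;

    nomem:
        *err = -ENOMEM;
        return NULL;
    }

    int main(void)
    {
        int err;
        struct node *n = build_pair(&err);

        if (!n) {
            printf("build_pair failed: %d\n", err);
            return 1;
        }
        printf("built ok\n");
        free(n->left);
        free(n->right);
        free(n);
        return 0;
    }
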
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cb759484979d..279f57abfecb 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -970,7 +970,8 @@ int icmp_rcv(struct sk_buff *skb)
970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently 970 * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
971 * discarded if to broadcast/multicast. 971 * discarded if to broadcast/multicast.
972 */ 972 */
973 if (icmph->type == ICMP_ECHO && 973 if ((icmph->type == ICMP_ECHO ||
974 icmph->type == ICMP_TIMESTAMP) &&
974 sysctl_icmp_echo_ignore_broadcasts) { 975 sysctl_icmp_echo_ignore_broadcasts) {
975 goto error; 976 goto error;
976 } 977 }
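
This icmp.c hunk extends the broadcast-ignore test from ICMP_ECHO alone to ICMP_ECHO and ICMP_TIMESTAMP, matching the RFC 1122 MAY quoted in the context. The added parentheses matter because && binds tighter than ||. A tiny illustrative comparison (userspace; the enum values mirror the usual ICMP type numbers):

    #include <stdio.h>

    enum { ECHO = 8, TIMESTAMP = 13 };

    static int drop_new(int type, int ignore_bcast)
    {
        return (type == ECHO || type == TIMESTAMP) && ignore_bcast;
    }

    static int drop_unparenthesized(int type, int ignore_bcast)
    {
        /* && binds tighter than ||, so this parses as
         * type == ECHO || (type == TIMESTAMP && ignore_bcast) */
        return type == ECHO || type == TIMESTAMP && ignore_bcast;
    }

    int main(void)
    {
        /* sysctl off: a broadcast ECHO must NOT be dropped */
        printf("parenthesized:   %d\n", drop_new(ECHO, 0));             /* 0 */
        printf("unparenthesized: %d\n", drop_unparenthesized(ECHO, 0)); /* 1 */
        return 0;
    }
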
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..5088f90835ae 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1615{ 1615{
1616 int err; 1616 int err;
1617 u32 addr = imr->imr_multiaddr.s_addr; 1617 u32 addr = imr->imr_multiaddr.s_addr;
1618 struct ip_mc_socklist *iml, *i; 1618 struct ip_mc_socklist *iml=NULL, *i;
1619 struct in_device *in_dev; 1619 struct in_device *in_dev;
1620 struct inet_sock *inet = inet_sk(sk); 1620 struct inet_sock *inet = inet_sk(sk);
1621 int ifindex;
1621 int count = 0; 1622 int count = 0;
1622 1623
1623 if (!MULTICAST(addr)) 1624 if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
1633 goto done; 1634 goto done;
1634 } 1635 }
1635 1636
1636 iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
1637
1638 err = -EADDRINUSE; 1637 err = -EADDRINUSE;
1638 ifindex = imr->imr_ifindex;
1639 for (i = inet->mc_list; i; i = i->next) { 1639 for (i = inet->mc_list; i; i = i->next) {
1640 if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { 1640 if (i->multi.imr_multiaddr.s_addr == addr &&
1641 /* New style additions are reference counted */ 1641 i->multi.imr_ifindex == ifindex)
1642 if (imr->imr_address.s_addr == 0) {
1643 i->count++;
1644 err = 0;
1645 }
1646 goto done; 1642 goto done;
1647 }
1648 count++; 1643 count++;
1649 } 1644 }
1650 err = -ENOBUFS; 1645 err = -ENOBUFS;
1651 if (iml == NULL || count >= sysctl_igmp_max_memberships) 1646 if (count >= sysctl_igmp_max_memberships)
1647 goto done;
1648 iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
1649 if (iml == NULL)
1652 goto done; 1650 goto done;
1651
1653 memcpy(&iml->multi, imr, sizeof(*imr)); 1652 memcpy(&iml->multi, imr, sizeof(*imr));
1654 iml->next = inet->mc_list; 1653 iml->next = inet->mc_list;
1655 iml->count = 1;
1656 iml->sflist = NULL; 1654 iml->sflist = NULL;
1657 iml->sfmode = MCAST_EXCLUDE; 1655 iml->sfmode = MCAST_EXCLUDE;
1658 inet->mc_list = iml; 1656 inet->mc_list = iml;
1659 ip_mc_inc_group(in_dev, addr); 1657 ip_mc_inc_group(in_dev, addr);
1660 iml = NULL;
1661 err = 0; 1658 err = 0;
1662
1663done: 1659done:
1664 rtnl_shunlock(); 1660 rtnl_shunlock();
1665 if (iml)
1666 sock_kfree_s(sk, iml, sizeof(*iml));
1667 return err; 1661 return err;
1668} 1662}
1669 1663
@@ -1693,30 +1687,25 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
1693{ 1687{
1694 struct inet_sock *inet = inet_sk(sk); 1688 struct inet_sock *inet = inet_sk(sk);
1695 struct ip_mc_socklist *iml, **imlp; 1689 struct ip_mc_socklist *iml, **imlp;
1690 struct in_device *in_dev;
1691 u32 group = imr->imr_multiaddr.s_addr;
1692 u32 ifindex;
1696 1693
1697 rtnl_lock(); 1694 rtnl_lock();
1695 in_dev = ip_mc_find_dev(imr);
1696 if (!in_dev) {
1697 rtnl_unlock();
1698 return -ENODEV;
1699 }
1700 ifindex = imr->imr_ifindex;
1698 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { 1701 for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
1699 if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && 1702 if (iml->multi.imr_multiaddr.s_addr == group &&
1700 iml->multi.imr_address.s_addr==imr->imr_address.s_addr && 1703 iml->multi.imr_ifindex == ifindex) {
1701 (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { 1704 (void) ip_mc_leave_src(sk, iml, in_dev);
1702 struct in_device *in_dev;
1703
1704 in_dev = inetdev_by_index(iml->multi.imr_ifindex);
1705 if (in_dev)
1706 (void) ip_mc_leave_src(sk, iml, in_dev);
1707 if (--iml->count) {
1708 rtnl_unlock();
1709 if (in_dev)
1710 in_dev_put(in_dev);
1711 return 0;
1712 }
1713 1705
1714 *imlp = iml->next; 1706 *imlp = iml->next;
1715 1707
1716 if (in_dev) { 1708 ip_mc_dec_group(in_dev, group);
1717 ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
1718 in_dev_put(in_dev);
1719 }
1720 rtnl_unlock(); 1709 rtnl_unlock();
1721 sock_kfree_s(sk, iml, sizeof(*iml)); 1710 sock_kfree_s(sk, iml, sizeof(*iml));
1722 return 0; 1711 return 0;
@@ -1736,6 +1725,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1736 struct in_device *in_dev = NULL; 1725 struct in_device *in_dev = NULL;
1737 struct inet_sock *inet = inet_sk(sk); 1726 struct inet_sock *inet = inet_sk(sk);
1738 struct ip_sf_socklist *psl; 1727 struct ip_sf_socklist *psl;
1728 int leavegroup = 0;
1739 int i, j, rv; 1729 int i, j, rv;
1740 1730
1741 if (!MULTICAST(addr)) 1731 if (!MULTICAST(addr))
@@ -1755,15 +1745,20 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1755 err = -EADDRNOTAVAIL; 1745 err = -EADDRNOTAVAIL;
1756 1746
1757 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1747 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1758 if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) 1748 if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
1749 && pmc->multi.imr_ifindex == imr.imr_ifindex)
1759 break; 1750 break;
1760 } 1751 }
1761 if (!pmc) /* must have a prior join */ 1752 if (!pmc) { /* must have a prior join */
1753 err = -EINVAL;
1762 goto done; 1754 goto done;
1755 }
1763 /* if a source filter was set, must be the same mode as before */ 1756 /* if a source filter was set, must be the same mode as before */
1764 if (pmc->sflist) { 1757 if (pmc->sflist) {
1765 if (pmc->sfmode != omode) 1758 if (pmc->sfmode != omode) {
1759 err = -EINVAL;
1766 goto done; 1760 goto done;
1761 }
1767 } else if (pmc->sfmode != omode) { 1762 } else if (pmc->sfmode != omode) {
1768 /* allow mode switches for empty-set filters */ 1763 /* allow mode switches for empty-set filters */
1769 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); 1764 ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
@@ -1775,7 +1770,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1775 psl = pmc->sflist; 1770 psl = pmc->sflist;
1776 if (!add) { 1771 if (!add) {
1777 if (!psl) 1772 if (!psl)
1778 goto done; 1773 goto done; /* err = -EADDRNOTAVAIL */
1779 rv = !0; 1774 rv = !0;
1780 for (i=0; i<psl->sl_count; i++) { 1775 for (i=0; i<psl->sl_count; i++) {
1781 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, 1776 rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
@@ -1784,7 +1779,13 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1784 break; 1779 break;
1785 } 1780 }
1786 if (rv) /* source not found */ 1781 if (rv) /* source not found */
1782 goto done; /* err = -EADDRNOTAVAIL */
1783
1784 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1785 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
1786 leavegroup = 1;
1787 goto done; 1787 goto done;
1788 }
1788 1789
1789 /* update the interface filter */ 1790 /* update the interface filter */
1790 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 1791 ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
@@ -1842,18 +1843,21 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
1842 &mreqs->imr_sourceaddr, 1); 1843 &mreqs->imr_sourceaddr, 1);
1843done: 1844done:
1844 rtnl_shunlock(); 1845 rtnl_shunlock();
1846 if (leavegroup)
1847 return ip_mc_leave_group(sk, &imr);
1845 return err; 1848 return err;
1846} 1849}
1847 1850
1848int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) 1851int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1849{ 1852{
1850 int err; 1853 int err = 0;
1851 struct ip_mreqn imr; 1854 struct ip_mreqn imr;
1852 u32 addr = msf->imsf_multiaddr; 1855 u32 addr = msf->imsf_multiaddr;
1853 struct ip_mc_socklist *pmc; 1856 struct ip_mc_socklist *pmc;
1854 struct in_device *in_dev; 1857 struct in_device *in_dev;
1855 struct inet_sock *inet = inet_sk(sk); 1858 struct inet_sock *inet = inet_sk(sk);
1856 struct ip_sf_socklist *newpsl, *psl; 1859 struct ip_sf_socklist *newpsl, *psl;
1860 int leavegroup = 0;
1857 1861
1858 if (!MULTICAST(addr)) 1862 if (!MULTICAST(addr))
1859 return -EINVAL; 1863 return -EINVAL;
@@ -1872,15 +1876,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1872 err = -ENODEV; 1876 err = -ENODEV;
1873 goto done; 1877 goto done;
1874 } 1878 }
1875 err = -EADDRNOTAVAIL; 1879
1880 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
1881 if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
1882 leavegroup = 1;
1883 goto done;
1884 }
1876 1885
1877 for (pmc=inet->mc_list; pmc; pmc=pmc->next) { 1886 for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
1878 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && 1887 if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
1879 pmc->multi.imr_ifindex == imr.imr_ifindex) 1888 pmc->multi.imr_ifindex == imr.imr_ifindex)
1880 break; 1889 break;
1881 } 1890 }
1882 if (!pmc) /* must have a prior join */ 1891 if (!pmc) { /* must have a prior join */
1892 err = -EINVAL;
1883 goto done; 1893 goto done;
1894 }
1884 if (msf->imsf_numsrc) { 1895 if (msf->imsf_numsrc) {
1885 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, 1896 newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
1886 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); 1897 IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
@@ -1909,8 +1920,11 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
1909 0, NULL, 0); 1920 0, NULL, 0);
1910 pmc->sflist = newpsl; 1921 pmc->sflist = newpsl;
1911 pmc->sfmode = msf->imsf_fmode; 1922 pmc->sfmode = msf->imsf_fmode;
1923 err = 0;
1912done: 1924done:
1913 rtnl_shunlock(); 1925 rtnl_shunlock();
1926 if (leavegroup)
1927 err = ip_mc_leave_group(sk, &imr);
1914 return err; 1928 return err;
1915} 1929}
1916 1930
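
Two patterns run through the igmp.c hunks: allocate only after the duplicate and membership-limit checks pass (the sock_kmalloc() call moved below the mc_list scan), and record deferred work in a flag — both ip_mc_source() and ip_mc_msfilter() now translate (INCLUDE, empty source list) into a group leave, but only after rtnl_shunlock(). A userspace sketch of the deferred-leave shape, with illustrative names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;
    static int members = 1;

    static void leave_group(void)
    {
        pthread_mutex_lock(&rtnl);
        members--;
        pthread_mutex_unlock(&rtnl);
    }

    static void set_filter(int fmode_include, int numsrc)
    {
        int leavegroup = 0;

        pthread_mutex_lock(&rtnl);
        /* (INCLUDE, empty) == LEAVE_GROUP: record the decision instead
         * of calling leave_group() here, which would re-take the lock */
        if (fmode_include && numsrc == 0)
            leavegroup = 1;
        pthread_mutex_unlock(&rtnl);

        if (leavegroup)
            leave_group();
    }

    int main(void)
    {
        set_filter(1, 0);
        printf("members = %d\n", members);    /* 0 */
        return 0;
    }
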
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index af2ec88bbb2f..c703528e0bcd 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -283,14 +283,18 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
283{ 283{
284 struct net_device *dev = skb->dev; 284 struct net_device *dev = skb->dev;
285 struct iphdr *iph = skb->nh.iph; 285 struct iphdr *iph = skb->nh.iph;
286 int err;
286 287
287 /* 288 /*
288 * Initialise the virtual path cache for the packet. It describes 289 * Initialise the virtual path cache for the packet. It describes
289 * how the packet travels inside Linux networking. 290 * how the packet travels inside Linux networking.
290 */ 291 */
291 if (skb->dst == NULL) { 292 if (skb->dst == NULL) {
292 if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) 293 if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
294 if (err == -EHOSTUNREACH)
295 IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
293 goto drop; 296 goto drop;
297 }
294 } 298 }
295 299
296#ifdef CONFIG_NET_CLS_ROUTE 300#ifdef CONFIG_NET_CLS_ROUTE
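
The ip_rcv_finish() change keeps a single drop path but captures the routing error so -EHOSTUNREACH can bump IPSTATS_MIB_INADDRERRORS; it pairs with the e_hostunreach exit added to ip_route_input_slow() later in this diff. A minimal sketch of the shape (helper and counter names are illustrative):

    #include <errno.h>
    #include <stdio.h>

    static unsigned long in_addr_errors;    /* stand-in for IPSTATS_MIB_INADDRERRORS */

    static int route_input(int reachable)
    {
        return reachable ? 0 : -EHOSTUNREACH;
    }

    static int rcv_finish(int reachable)
    {
        int err = route_input(reachable);

        if (err) {
            if (err == -EHOSTUNREACH)
                in_addr_errors++;    /* count, then fall through to drop */
            return -1;               /* drop */
        }
        return 0;
    }

    int main(void)
    {
        rcv_finish(0);
        printf("InAddrErrors = %lu\n", in_addr_errors);
        return 0;
    }
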
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ee07aec215a0..80d13103b2b0 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -107,7 +107,6 @@ static int ip_dev_loopback_xmit(struct sk_buff *newskb)
107 newskb->pkt_type = PACKET_LOOPBACK; 107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY; 108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
109 BUG_TRAP(newskb->dst); 109 BUG_TRAP(newskb->dst);
110 nf_reset(newskb);
111 netif_rx(newskb); 110 netif_rx(newskb);
112 return 0; 111 return 0;
113} 112}
@@ -188,8 +187,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
188 skb = skb2; 187 skb = skb2;
189 } 188 }
190 189
191 nf_reset(skb);
192
193 if (hh) { 190 if (hh) {
194 int hh_alen; 191 int hh_alen;
195 192
@@ -383,7 +380,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
383 to->pkt_type = from->pkt_type; 380 to->pkt_type = from->pkt_type;
384 to->priority = from->priority; 381 to->priority = from->priority;
385 to->protocol = from->protocol; 382 to->protocol = from->protocol;
386 to->security = from->security;
387 dst_release(to->dst); 383 dst_release(to->dst);
388 to->dst = dst_clone(from->dst); 384 to->dst = dst_clone(from->dst);
389 to->dev = from->dev; 385 to->dev = from->dev;
@@ -1323,23 +1319,8 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
1323 ip_rt_put(rt); 1319 ip_rt_put(rt);
1324} 1320}
1325 1321
1326/*
1327 * IP protocol layer initialiser
1328 */
1329
1330static struct packet_type ip_packet_type = {
1331 .type = __constant_htons(ETH_P_IP),
1332 .func = ip_rcv,
1333};
1334
1335/*
1336 * IP registers the packet type and then calls the subprotocol initialisers
1337 */
1338
1339void __init ip_init(void) 1322void __init ip_init(void)
1340{ 1323{
1341 dev_add_pack(&ip_packet_type);
1342
1343 ip_rt_init(); 1324 ip_rt_init();
1344 inet_initpeers(); 1325 inet_initpeers();
1345 1326
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f8b172f89811..fc7c481d0d79 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -677,11 +677,11 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
677 mreq.imr_address.s_addr = mreqs.imr_interface; 677 mreq.imr_address.s_addr = mreqs.imr_interface;
678 mreq.imr_ifindex = 0; 678 mreq.imr_ifindex = 0;
679 err = ip_mc_join_group(sk, &mreq); 679 err = ip_mc_join_group(sk, &mreq);
680 if (err) 680 if (err && err != -EADDRINUSE)
681 break; 681 break;
682 omode = MCAST_INCLUDE; 682 omode = MCAST_INCLUDE;
683 add = 1; 683 add = 1;
684 } else /*IP_DROP_SOURCE_MEMBERSHIP */ { 684 } else /* IP_DROP_SOURCE_MEMBERSHIP */ {
685 omode = MCAST_INCLUDE; 685 omode = MCAST_INCLUDE;
686 add = 0; 686 add = 0;
687 } 687 }
@@ -754,7 +754,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
754 mreq.imr_address.s_addr = 0; 754 mreq.imr_address.s_addr = 0;
755 mreq.imr_ifindex = greqs.gsr_interface; 755 mreq.imr_ifindex = greqs.gsr_interface;
756 err = ip_mc_join_group(sk, &mreq); 756 err = ip_mc_join_group(sk, &mreq);
757 if (err) 757 if (err && err != -EADDRINUSE)
758 break; 758 break;
759 greqs.gsr_interface = mreq.imr_ifindex; 759 greqs.gsr_interface = mreq.imr_ifindex;
760 omode = MCAST_INCLUDE; 760 omode = MCAST_INCLUDE;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f2509034ce72..d2bf8e1930a3 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1149,8 +1149,10 @@ static int __init ic_dynamic(void)
1149 ic_rarp_cleanup(); 1149 ic_rarp_cleanup();
1150#endif 1150#endif
1151 1151
1152 if (!ic_got_reply) 1152 if (!ic_got_reply) {
1153 ic_myaddr = INADDR_NONE;
1153 return -1; 1154 return -1;
1155 }
1154 1156
1155 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", 1157 printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
1156 ((ic_got_reply & IC_RARP) ? "RARP" 1158 ((ic_got_reply & IC_RARP) ? "RARP"
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e4f809a93f47..7833d920bdba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -297,6 +297,7 @@ static int vif_delete(int vifi)
297static void ipmr_destroy_unres(struct mfc_cache *c) 297static void ipmr_destroy_unres(struct mfc_cache *c)
298{ 298{
299 struct sk_buff *skb; 299 struct sk_buff *skb;
300 struct nlmsgerr *e;
300 301
301 atomic_dec(&cache_resolve_queue_len); 302 atomic_dec(&cache_resolve_queue_len);
302 303
@@ -306,7 +307,9 @@ static void ipmr_destroy_unres(struct mfc_cache *c)
306 nlh->nlmsg_type = NLMSG_ERROR; 307 nlh->nlmsg_type = NLMSG_ERROR;
307 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 308 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
308 skb_trim(skb, nlh->nlmsg_len); 309 skb_trim(skb, nlh->nlmsg_len);
309 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; 310 e = NLMSG_DATA(nlh);
311 e->error = -ETIMEDOUT;
312 memset(&e->msg, 0, sizeof(e->msg));
310 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 313 netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
311 } else 314 } else
312 kfree_skb(skb); 315 kfree_skb(skb);
@@ -499,6 +502,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
499static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) 502static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
500{ 503{
501 struct sk_buff *skb; 504 struct sk_buff *skb;
505 struct nlmsgerr *e;
502 506
503 /* 507 /*
504 * Play the pending entries through our router 508 * Play the pending entries through our router
@@ -515,7 +519,9 @@ static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
515 nlh->nlmsg_type = NLMSG_ERROR; 519 nlh->nlmsg_type = NLMSG_ERROR;
516 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); 520 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
517 skb_trim(skb, nlh->nlmsg_len); 521 skb_trim(skb, nlh->nlmsg_len);
518 ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; 522 e = NLMSG_DATA(nlh);
523 e->error = -EMSGSIZE;
524 memset(&e->msg, 0, sizeof(e->msg));
519 } 525 }
520 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); 526 err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
521 } else 527 } else
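
Both ipmr.c hunks plug the same leak: struct nlmsgerr embeds a copy of the offending nlmsghdr, and previously only the error field was written before netlink_unicast() shipped the skb to userspace, so the embedded header carried stale kernel memory. A userspace analogue of the rule — zero any embedded struct you do not fully fill before it crosses a trust boundary (field names here are illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    struct msg_hdr { unsigned int len, type, flags, seq, pid; };
    struct msg_err { int error; struct msg_hdr msg; };  /* shape of nlmsgerr */

    static void fill_error(struct msg_err *e, int error)
    {
        e->error = error;
        /* zero the embedded header: unwritten bytes would otherwise
         * carry whatever was previously in this memory */
        memset(&e->msg, 0, sizeof(e->msg));
    }

    int main(void)
    {
        struct msg_err e;

        fill_error(&e, -ETIMEDOUT);
        printf("error=%d len=%u\n", e.error, e.msg.len);
        return 0;
    }
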
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
index 63a82b4b64bb..c9820bfc493a 100644
--- a/net/ipv4/ipvs/Kconfig
+++ b/net/ipv4/ipvs/Kconfig
@@ -2,11 +2,11 @@
2# IP Virtual Server configuration 2# IP Virtual Server configuration
3# 3#
4menu "IP: Virtual Server Configuration" 4menu "IP: Virtual Server Configuration"
5 depends on INET && NETFILTER 5 depends on NETFILTER
6 6
7config IP_VS 7config IP_VS
8 tristate "IP virtual server support (EXPERIMENTAL)" 8 tristate "IP virtual server support (EXPERIMENTAL)"
9 depends on INET && NETFILTER 9 depends on NETFILTER
10 ---help--- 10 ---help---
11 IP Virtual Server support will let you build a high-performance 11 IP Virtual Server support will let you build a high-performance
12 virtual server based on cluster of two or more real servers. This 12 virtual server based on cluster of two or more real servers. This
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
index fd6feb5499fe..d0145a8b1551 100644
--- a/net/ipv4/ipvs/ip_vs_conn.c
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -548,7 +548,6 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
548{ 548{
549 if (del_timer(&cp->timer)) 549 if (del_timer(&cp->timer))
550 mod_timer(&cp->timer, jiffies); 550 mod_timer(&cp->timer, jiffies);
551 __ip_vs_conn_put(cp);
552} 551}
553 552
554 553
@@ -759,12 +758,11 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
759 return 1; 758 return 1;
760} 759}
761 760
762 761/* Called from keventd and must protect itself from softirqs */
763void ip_vs_random_dropentry(void) 762void ip_vs_random_dropentry(void)
764{ 763{
765 int idx; 764 int idx;
766 struct ip_vs_conn *cp; 765 struct ip_vs_conn *cp;
767 struct ip_vs_conn *ct;
768 766
769 /* 767 /*
770 * Randomly scan 1/32 of the whole table every second 768 * Randomly scan 1/32 of the whole table every second
@@ -775,7 +773,7 @@ void ip_vs_random_dropentry(void)
775 /* 773 /*
776 * Lock is actually needed in this loop. 774 * Lock is actually needed in this loop.
777 */ 775 */
778 ct_write_lock(hash); 776 ct_write_lock_bh(hash);
779 777
780 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 778 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
781 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) 779 if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
@@ -801,23 +799,14 @@ void ip_vs_random_dropentry(void)
801 continue; 799 continue;
802 } 800 }
803 801
804 /*
805 * Drop the entry, and drop its ct if not referenced
806 */
807 atomic_inc(&cp->refcnt);
808 ct_write_unlock(hash);
809
810 if ((ct = cp->control))
811 atomic_inc(&ct->refcnt);
812 IP_VS_DBG(4, "del connection\n"); 802 IP_VS_DBG(4, "del connection\n");
813 ip_vs_conn_expire_now(cp); 803 ip_vs_conn_expire_now(cp);
814 if (ct) { 804 if (cp->control) {
815 IP_VS_DBG(4, "del conn template\n"); 805 IP_VS_DBG(4, "del conn template\n");
816 ip_vs_conn_expire_now(ct); 806 ip_vs_conn_expire_now(cp->control);
817 } 807 }
818 ct_write_lock(hash);
819 } 808 }
820 ct_write_unlock(hash); 809 ct_write_unlock_bh(hash);
821 } 810 }
822} 811}
823 812
@@ -829,7 +818,6 @@ static void ip_vs_conn_flush(void)
829{ 818{
830 int idx; 819 int idx;
831 struct ip_vs_conn *cp; 820 struct ip_vs_conn *cp;
832 struct ip_vs_conn *ct;
833 821
834 flush_again: 822 flush_again:
835 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { 823 for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
@@ -839,18 +827,13 @@ static void ip_vs_conn_flush(void)
839 ct_write_lock_bh(idx); 827 ct_write_lock_bh(idx);
840 828
841 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 829 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
842 atomic_inc(&cp->refcnt);
843 ct_write_unlock(idx);
844 830
845 if ((ct = cp->control))
846 atomic_inc(&ct->refcnt);
847 IP_VS_DBG(4, "del connection\n"); 831 IP_VS_DBG(4, "del connection\n");
848 ip_vs_conn_expire_now(cp); 832 ip_vs_conn_expire_now(cp);
849 if (ct) { 833 if (cp->control) {
850 IP_VS_DBG(4, "del conn template\n"); 834 IP_VS_DBG(4, "del conn template\n");
851 ip_vs_conn_expire_now(ct); 835 ip_vs_conn_expire_now(cp->control);
852 } 836 }
853 ct_write_lock(idx);
854 } 837 }
855 ct_write_unlock_bh(idx); 838 ct_write_unlock_bh(idx);
856 } 839 }
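
The ip_vs_conn.c rewrite drops the unlock/relock dance inside the bucket walk: ip_vs_conn_expire_now() no longer puts a reference, so the walker can hold the bucket lock across the whole list_for_each_entry(), and since ip_vs_random_dropentry() is called from keventd it now uses the _bh lock variants to keep softirq users out. A userspace sketch of why holding the lock across the walk removes the refcount juggling (names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    struct conn { struct conn *next; int expired; };

    static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

    static void scan_bucket(struct conn *head)
    {
        struct conn *cp;

        pthread_mutex_lock(&bucket_lock);      /* ct_write_lock_bh() */
        for (cp = head; cp; cp = cp->next) {
            /* never unlock mid-walk, so cp->next stays valid without
             * taking an extra reference on cp */
            cp->expired = 1;
        }
        pthread_mutex_unlock(&bucket_lock);    /* ct_write_unlock_bh() */
    }

    int main(void)
    {
        struct conn c2 = { NULL, 0 }, c1 = { &c2, 0 };

        scan_bucket(&c1);
        printf("%d %d\n", c1.expired, c2.expired);    /* 1 1 */
        return 0;
    }
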
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
index 218d9701036e..7d99ede2ef79 100644
--- a/net/ipv4/ipvs/ip_vs_ctl.c
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -90,7 +90,8 @@ int ip_vs_get_debug_level(void)
90#endif 90#endif
91 91
92/* 92/*
93 * update_defense_level is called from keventd and from sysctl. 93 * update_defense_level is called from keventd and from sysctl,
94 * so it needs to protect itself from softirqs
94 */ 95 */
95static void update_defense_level(void) 96static void update_defense_level(void)
96{ 97{
@@ -110,6 +111,8 @@ static void update_defense_level(void)
110 111
111 nomem = (availmem < sysctl_ip_vs_amemthresh); 112 nomem = (availmem < sysctl_ip_vs_amemthresh);
112 113
114 local_bh_disable();
115
113 /* drop_entry */ 116 /* drop_entry */
114 spin_lock(&__ip_vs_dropentry_lock); 117 spin_lock(&__ip_vs_dropentry_lock);
115 switch (sysctl_ip_vs_drop_entry) { 118 switch (sysctl_ip_vs_drop_entry) {
@@ -206,6 +209,8 @@ static void update_defense_level(void)
206 if (to_change >= 0) 209 if (to_change >= 0)
207 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 210 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
208 write_unlock(&__ip_vs_securetcp_lock); 211 write_unlock(&__ip_vs_securetcp_lock);
212
213 local_bh_enable();
209} 214}
210 215
211 216
@@ -1360,9 +1365,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
1360 /* Restore the correct value */ 1365 /* Restore the correct value */
1361 *valp = val; 1366 *valp = val;
1362 } else { 1367 } else {
1363 local_bh_disable();
1364 update_defense_level(); 1368 update_defense_level();
1365 local_bh_enable();
1366 } 1369 }
1367 } 1370 }
1368 return rc; 1371 return rc;
@@ -2059,7 +2062,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2059 dst->addr = src->addr; 2062 dst->addr = src->addr;
2060 dst->port = src->port; 2063 dst->port = src->port;
2061 dst->fwmark = src->fwmark; 2064 dst->fwmark = src->fwmark;
2062 strcpy(dst->sched_name, src->scheduler->name); 2065 strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2063 dst->flags = src->flags; 2066 dst->flags = src->flags;
2064 dst->timeout = src->timeout / HZ; 2067 dst->timeout = src->timeout / HZ;
2065 dst->netmask = src->netmask; 2068 dst->netmask = src->netmask;
@@ -2080,6 +2083,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2080 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2083 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2081 if (count >= get->num_services) 2084 if (count >= get->num_services)
2082 goto out; 2085 goto out;
2086 memset(&entry, 0, sizeof(entry));
2083 ip_vs_copy_service(&entry, svc); 2087 ip_vs_copy_service(&entry, svc);
2084 if (copy_to_user(&uptr->entrytable[count], 2088 if (copy_to_user(&uptr->entrytable[count],
2085 &entry, sizeof(entry))) { 2089 &entry, sizeof(entry))) {
@@ -2094,6 +2098,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2094 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2098 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2095 if (count >= get->num_services) 2099 if (count >= get->num_services)
2096 goto out; 2100 goto out;
2101 memset(&entry, 0, sizeof(entry));
2097 ip_vs_copy_service(&entry, svc); 2102 ip_vs_copy_service(&entry, svc);
2098 if (copy_to_user(&uptr->entrytable[count], 2103 if (copy_to_user(&uptr->entrytable[count],
2099 &entry, sizeof(entry))) { 2104 &entry, sizeof(entry))) {
@@ -2304,12 +2309,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2304 memset(&d, 0, sizeof(d)); 2309 memset(&d, 0, sizeof(d));
2305 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2310 if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
2306 d[0].state = IP_VS_STATE_MASTER; 2311 d[0].state = IP_VS_STATE_MASTER;
2307 strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); 2312 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn));
2308 d[0].syncid = ip_vs_master_syncid; 2313 d[0].syncid = ip_vs_master_syncid;
2309 } 2314 }
2310 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2315 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
2311 d[1].state = IP_VS_STATE_BACKUP; 2316 d[1].state = IP_VS_STATE_BACKUP;
2312 strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); 2317 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn));
2313 d[1].syncid = ip_vs_backup_syncid; 2318 d[1].syncid = ip_vs_backup_syncid;
2314 } 2319 }
2315 if (copy_to_user(user, &d, sizeof(d)) != 0) 2320 if (copy_to_user(user, &d, sizeof(d)) != 0)
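
The ip_vs_ctl.c hunks above (and the ip_vs_sync.c hunk that follows) share a hardening theme: strlcpy() bounds each scheduler and interface name copy by its destination size, and zeroing entry before ip_vs_copy_service() keeps copy_to_user() from shipping uninitialized stack bytes. strlcpy is a kernel/BSD helper rather than ISO C, so this sketch carries a minimal local version for illustration:

    #include <stdio.h>
    #include <string.h>

    /* minimal strlcpy: always NUL-terminates, returns strlen(src) so
     * the caller can detect truncation (return >= size) */
    static size_t my_strlcpy(char *dst, const char *src, size_t size)
    {
        size_t len = strlen(src);

        if (size) {
            size_t n = len >= size ? size - 1 : len;

            memcpy(dst, src, n);
            dst[n] = '\0';
        }
        return len;
    }

    int main(void)
    {
        char ifname[8];

        if (my_strlcpy(ifname, "very-long-interface-name", sizeof(ifname))
            >= sizeof(ifname))
            fprintf(stderr, "name truncated\n");
        printf("%s\n", ifname);
        return 0;
    }
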
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
index 25c479550a32..574d1f509b46 100644
--- a/net/ipv4/ipvs/ip_vs_sync.c
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -839,10 +839,10 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
839 839
840 ip_vs_sync_state |= state; 840 ip_vs_sync_state |= state;
841 if (state == IP_VS_STATE_MASTER) { 841 if (state == IP_VS_STATE_MASTER) {
842 strcpy(ip_vs_master_mcast_ifn, mcast_ifn); 842 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, sizeof(ip_vs_master_mcast_ifn));
843 ip_vs_master_syncid = syncid; 843 ip_vs_master_syncid = syncid;
844 } else { 844 } else {
845 strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); 845 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, sizeof(ip_vs_backup_mcast_ifn));
846 ip_vs_backup_syncid = syncid; 846 ip_vs_backup_syncid = syncid;
847 } 847 }
848 848
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 42dc95102873..1dd824f3cf0a 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag(unsigned int hooknum,
432 const struct net_device *out, 432 const struct net_device *out,
433 int (*okfn)(struct sk_buff *)) 433 int (*okfn)(struct sk_buff *))
434{ 434{
435#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
436 /* Previously seen (loopback)? Ignore. Do this before
437 fragment check. */
438 if ((*pskb)->nfct)
439 return NF_ACCEPT;
440#endif
441
435 /* Gather fragments. */ 442 /* Gather fragments. */
436 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { 443 if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
437 *pskb = ip_ct_gather_frags(*pskb, 444 *pskb = ip_ct_gather_frags(*pskb,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 9cde8c61f525..6706d3a1bc4f 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -30,7 +30,7 @@
30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> 30#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
31#include <linux/netfilter_ipv4/ip_conntrack.h> 31#include <linux/netfilter_ipv4/ip_conntrack.h>
32 32
33#define CLUSTERIP_VERSION "0.6" 33#define CLUSTERIP_VERSION "0.7"
34 34
35#define DEBUG_CLUSTERIP 35#define DEBUG_CLUSTERIP
36 36
@@ -524,8 +524,9 @@ arp_mangle(unsigned int hook,
524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) 524 || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
525 return NF_ACCEPT; 525 return NF_ACCEPT;
526 526
527 /* we only want to mangle arp replies */ 527 /* we only want to mangle arp requests and replies */
528 if (arp->ar_op != htons(ARPOP_REPLY)) 528 if (arp->ar_op != htons(ARPOP_REPLY)
529 && arp->ar_op != htons(ARPOP_REQUEST))
529 return NF_ACCEPT; 530 return NF_ACCEPT;
530 531
531 payload = (void *)(arp+1); 532 payload = (void *)(arp+1);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 80cf633d9f4a..d675ff80b04d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -54,6 +54,7 @@
54 * Marc Boucher : routing by fwmark 54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics 55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file 56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * 58 *
58 * This program is free software; you can redistribute it and/or 59 * This program is free software; you can redistribute it and/or
59 * modify it under the terms of the GNU General Public License 60 * modify it under the terms of the GNU General Public License
@@ -70,6 +71,7 @@
70#include <linux/kernel.h> 71#include <linux/kernel.h>
71#include <linux/sched.h> 72#include <linux/sched.h>
72#include <linux/mm.h> 73#include <linux/mm.h>
74#include <linux/bootmem.h>
73#include <linux/string.h> 75#include <linux/string.h>
74#include <linux/socket.h> 76#include <linux/socket.h>
75#include <linux/sockios.h> 77#include <linux/sockios.h>
@@ -201,8 +203,37 @@ __u8 ip_tos2prio[16] = {
201 203
202struct rt_hash_bucket { 204struct rt_hash_bucket {
203 struct rtable *chain; 205 struct rtable *chain;
204 spinlock_t lock; 206};
205} __attribute__((__aligned__(8))); 207#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
208/*
209 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
210 * The size of this table is a power of two and depends on the number of CPUS.
211 */
212#if NR_CPUS >= 32
213#define RT_HASH_LOCK_SZ 4096
214#elif NR_CPUS >= 16
215#define RT_HASH_LOCK_SZ 2048
216#elif NR_CPUS >= 8
217#define RT_HASH_LOCK_SZ 1024
218#elif NR_CPUS >= 4
219#define RT_HASH_LOCK_SZ 512
220#else
221#define RT_HASH_LOCK_SZ 256
222#endif
223
224static spinlock_t *rt_hash_locks;
225# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
226# define rt_hash_lock_init() { \
227 int i; \
228 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
229 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
230 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
231 spin_lock_init(&rt_hash_locks[i]); \
232 }
233#else
234# define rt_hash_lock_addr(slot) NULL
235# define rt_hash_lock_init()
236#endif
206 237
207static struct rt_hash_bucket *rt_hash_table; 238static struct rt_hash_bucket *rt_hash_table;
208static unsigned rt_hash_mask; 239static unsigned rt_hash_mask;
@@ -575,19 +606,26 @@ static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
575/* This runs via a timer and thus is always in BH context. */ 606/* This runs via a timer and thus is always in BH context. */
576static void rt_check_expire(unsigned long dummy) 607static void rt_check_expire(unsigned long dummy)
577{ 608{
578 static int rover; 609 static unsigned int rover;
579 int i = rover, t; 610 unsigned int i = rover, goal;
580 struct rtable *rth, **rthp; 611 struct rtable *rth, **rthp;
581 unsigned long now = jiffies; 612 unsigned long now = jiffies;
582 613 u64 mult;
583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 614
584 t -= ip_rt_gc_timeout) { 615 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
616 if (ip_rt_gc_timeout > 1)
617 do_div(mult, ip_rt_gc_timeout);
618 goal = (unsigned int)mult;
619 if (goal > rt_hash_mask) goal = rt_hash_mask + 1;
620 for (; goal > 0; goal--) {
585 unsigned long tmo = ip_rt_gc_timeout; 621 unsigned long tmo = ip_rt_gc_timeout;
586 622
587 i = (i + 1) & rt_hash_mask; 623 i = (i + 1) & rt_hash_mask;
588 rthp = &rt_hash_table[i].chain; 624 rthp = &rt_hash_table[i].chain;
589 625
590 spin_lock(&rt_hash_table[i].lock); 626 if (*rthp == 0)
627 continue;
628 spin_lock(rt_hash_lock_addr(i));
591 while ((rth = *rthp) != NULL) { 629 while ((rth = *rthp) != NULL) {
592 if (rth->u.dst.expires) { 630 if (rth->u.dst.expires) {
593 /* Entry is expired even if it is in use */ 631 /* Entry is expired even if it is in use */
@@ -620,14 +658,14 @@ static void rt_check_expire(unsigned long dummy)
620 rt_free(rth); 658 rt_free(rth);
621#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 659#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
622 } 660 }
623 spin_unlock(&rt_hash_table[i].lock); 661 spin_unlock(rt_hash_lock_addr(i));
624 662
625 /* Fallback loop breaker. */ 663 /* Fallback loop breaker. */
626 if (time_after(jiffies, now)) 664 if (time_after(jiffies, now))
627 break; 665 break;
628 } 666 }
629 rover = i; 667 rover = i;
630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 668 mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
631} 669}
632 670
633/* This can run from both BH and non-BH contexts, the latter 671/* This can run from both BH and non-BH contexts, the latter
@@ -643,11 +681,11 @@ static void rt_run_flush(unsigned long dummy)
643 get_random_bytes(&rt_hash_rnd, 4); 681 get_random_bytes(&rt_hash_rnd, 4);
644 682
645 for (i = rt_hash_mask; i >= 0; i--) { 683 for (i = rt_hash_mask; i >= 0; i--) {
646 spin_lock_bh(&rt_hash_table[i].lock); 684 spin_lock_bh(rt_hash_lock_addr(i));
647 rth = rt_hash_table[i].chain; 685 rth = rt_hash_table[i].chain;
648 if (rth) 686 if (rth)
649 rt_hash_table[i].chain = NULL; 687 rt_hash_table[i].chain = NULL;
650 spin_unlock_bh(&rt_hash_table[i].lock); 688 spin_unlock_bh(rt_hash_lock_addr(i));
651 689
652 for (; rth; rth = next) { 690 for (; rth; rth = next) {
653 next = rth->u.rt_next; 691 next = rth->u.rt_next;
@@ -780,7 +818,7 @@ static int rt_garbage_collect(void)
780 818
781 k = (k + 1) & rt_hash_mask; 819 k = (k + 1) & rt_hash_mask;
782 rthp = &rt_hash_table[k].chain; 820 rthp = &rt_hash_table[k].chain;
783 spin_lock_bh(&rt_hash_table[k].lock); 821 spin_lock_bh(rt_hash_lock_addr(k));
784 while ((rth = *rthp) != NULL) { 822 while ((rth = *rthp) != NULL) {
785 if (!rt_may_expire(rth, tmo, expire)) { 823 if (!rt_may_expire(rth, tmo, expire)) {
786 tmo >>= 1; 824 tmo >>= 1;
@@ -812,7 +850,7 @@ static int rt_garbage_collect(void)
812 goal--; 850 goal--;
813#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 851#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
814 } 852 }
815 spin_unlock_bh(&rt_hash_table[k].lock); 853 spin_unlock_bh(rt_hash_lock_addr(k));
816 if (goal <= 0) 854 if (goal <= 0)
817 break; 855 break;
818 } 856 }
@@ -882,7 +920,7 @@ restart:
882 920
883 rthp = &rt_hash_table[hash].chain; 921 rthp = &rt_hash_table[hash].chain;
884 922
885 spin_lock_bh(&rt_hash_table[hash].lock); 923 spin_lock_bh(rt_hash_lock_addr(hash));
886 while ((rth = *rthp) != NULL) { 924 while ((rth = *rthp) != NULL) {
887#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 925#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
888 if (!(rth->u.dst.flags & DST_BALANCED) && 926 if (!(rth->u.dst.flags & DST_BALANCED) &&
@@ -908,7 +946,7 @@ restart:
908 rth->u.dst.__use++; 946 rth->u.dst.__use++;
909 dst_hold(&rth->u.dst); 947 dst_hold(&rth->u.dst);
910 rth->u.dst.lastuse = now; 948 rth->u.dst.lastuse = now;
911 spin_unlock_bh(&rt_hash_table[hash].lock); 949 spin_unlock_bh(rt_hash_lock_addr(hash));
912 950
913 rt_drop(rt); 951 rt_drop(rt);
914 *rp = rth; 952 *rp = rth;
@@ -949,7 +987,7 @@ restart:
949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 987 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
950 int err = arp_bind_neighbour(&rt->u.dst); 988 int err = arp_bind_neighbour(&rt->u.dst);
951 if (err) { 989 if (err) {
952 spin_unlock_bh(&rt_hash_table[hash].lock); 990 spin_unlock_bh(rt_hash_lock_addr(hash));
953 991
954 if (err != -ENOBUFS) { 992 if (err != -ENOBUFS) {
955 rt_drop(rt); 993 rt_drop(rt);
@@ -990,7 +1028,7 @@ restart:
990 } 1028 }
991#endif 1029#endif
992 rt_hash_table[hash].chain = rt; 1030 rt_hash_table[hash].chain = rt;
993 spin_unlock_bh(&rt_hash_table[hash].lock); 1031 spin_unlock_bh(rt_hash_lock_addr(hash));
994 *rp = rt; 1032 *rp = rt;
995 return 0; 1033 return 0;
996} 1034}
@@ -1058,7 +1096,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1058{ 1096{
1059 struct rtable **rthp; 1097 struct rtable **rthp;
1060 1098
1061 spin_lock_bh(&rt_hash_table[hash].lock); 1099 spin_lock_bh(rt_hash_lock_addr(hash));
1062 ip_rt_put(rt); 1100 ip_rt_put(rt);
1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1101 for (rthp = &rt_hash_table[hash].chain; *rthp;
1064 rthp = &(*rthp)->u.rt_next) 1102 rthp = &(*rthp)->u.rt_next)
@@ -1067,7 +1105,7 @@ static void rt_del(unsigned hash, struct rtable *rt)
1067 rt_free(rt); 1105 rt_free(rt);
1068 break; 1106 break;
1069 } 1107 }
1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1108 spin_unlock_bh(rt_hash_lock_addr(hash));
1071} 1109}
1072 1110
1073void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1111void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -1647,7 +1685,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1685 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1648 "%u.%u.%u.%u, on dev %s\n", 1686 "%u.%u.%u.%u, on dev %s\n",
1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1687 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1650 if (dev->hard_header_len) { 1688 if (dev->hard_header_len && skb->mac.raw) {
1651 int i; 1689 int i;
1652 unsigned char *p = skb->mac.raw; 1690 unsigned char *p = skb->mac.raw;
1653 printk(KERN_WARNING "ll header: "); 1691 printk(KERN_WARNING "ll header: ");
@@ -1909,7 +1947,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1909 */ 1947 */
1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1948 if ((err = fib_lookup(&fl, &res)) != 0) {
1911 if (!IN_DEV_FORWARD(in_dev)) 1949 if (!IN_DEV_FORWARD(in_dev))
1912 goto e_inval; 1950 goto e_hostunreach;
1913 goto no_route; 1951 goto no_route;
1914 } 1952 }
1915 free_res = 1; 1953 free_res = 1;
@@ -1933,7 +1971,7 @@ static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
1933 } 1971 }
1934 1972
1935 if (!IN_DEV_FORWARD(in_dev)) 1973 if (!IN_DEV_FORWARD(in_dev))
1936 goto e_inval; 1974 goto e_hostunreach;
1937 if (res.type != RTN_UNICAST) 1975 if (res.type != RTN_UNICAST)
1938 goto martian_destination; 1976 goto martian_destination;
1939 1977
@@ -2025,6 +2063,11 @@ martian_destination:
2025 "%u.%u.%u.%u, dev %s\n", 2063 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2064 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027#endif 2065#endif
2066
2067e_hostunreach:
2068 err = -EHOSTUNREACH;
2069 goto done;
2070
2028e_inval: 2071e_inval:
2029 err = -EINVAL; 2072 err = -EINVAL;
2030 goto done; 2073 goto done;
@@ -3068,12 +3111,14 @@ __setup("rhash_entries=", set_rhash_entries);
3068 3111
3069int __init ip_rt_init(void) 3112int __init ip_rt_init(void)
3070{ 3113{
3071 int i, order, goal, rc = 0; 3114 int rc = 0;
3072 3115
3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3116 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3074 (jiffies ^ (jiffies >> 7))); 3117 (jiffies ^ (jiffies >> 7)));
3075 3118
3076#ifdef CONFIG_NET_CLS_ROUTE 3119#ifdef CONFIG_NET_CLS_ROUTE
3120 {
3121 int order;
3077 for (order = 0; 3122 for (order = 0;
3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3123 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
3079 /* NOTHING */; 3124 /* NOTHING */;
@@ -3081,6 +3126,7 @@ int __init ip_rt_init(void)
3081 if (!ip_rt_acct) 3126 if (!ip_rt_acct)
3082 panic("IP: failed to allocate ip_rt_acct\n"); 3127 panic("IP: failed to allocate ip_rt_acct\n");
3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3128 memset(ip_rt_acct, 0, PAGE_SIZE << order);
3129 }
3084#endif 3130#endif
3085 3131
3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3132 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
@@ -3091,36 +3137,19 @@ int __init ip_rt_init(void)
3091 if (!ipv4_dst_ops.kmem_cachep) 3137 if (!ipv4_dst_ops.kmem_cachep)
3092 panic("IP: failed to allocate ip_dst_cache\n"); 3138 panic("IP: failed to allocate ip_dst_cache\n");
3093 3139
3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3140 rt_hash_table = (struct rt_hash_bucket *)
3095 if (rhash_entries) 3141 alloc_large_system_hash("IP route cache",
3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3142 sizeof(struct rt_hash_bucket),
3097 for (order = 0; (1UL << order) < goal; order++) 3143 rhash_entries,
3098 /* NOTHING */; 3144 (num_physpages >= 128 * 1024) ?
3099 3145 (27 - PAGE_SHIFT) :
3100 do { 3146 (29 - PAGE_SHIFT),
3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3147 HASH_HIGHMEM,
3102 sizeof(struct rt_hash_bucket); 3148 &rt_hash_log,
3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3149 &rt_hash_mask,
3104 rt_hash_mask--; 3150 0);
3105 rt_hash_table = (struct rt_hash_bucket *) 3151 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3106 __get_free_pages(GFP_ATOMIC, order); 3152 rt_hash_lock_init();
3107 } while (rt_hash_table == NULL && --order > 0);
3108
3109 if (!rt_hash_table)
3110 panic("Failed to allocate IP route cache hash table\n");
3111
3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
3113 rt_hash_mask,
3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
3115
3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
3117 /* NOTHING */;
3118
3119 rt_hash_mask--;
3120 for (i = 0; i <= rt_hash_mask; i++) {
3121 spin_lock_init(&rt_hash_table[i].lock);
3122 rt_hash_table[i].chain = NULL;
3123 }
3124 3153
3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3154 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3155 ip_rt_max_size = (rt_hash_mask + 1) * 16;
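
The largest route.c change replaces the per-bucket spinlock with lock striping: struct rt_hash_bucket shrinks to a single chain pointer, and a power-of-two table of at most 4096 spinlocks (scaled by NR_CPUS) is shared by all buckets through rt_hash_lock_addr(slot); on UP builds the table compiles away entirely. Distinct buckets may map to the same lock, which stays correct as long as every path locks a bucket through the same mapping. A userspace sketch with illustrative sizes:

    #include <pthread.h>
    #include <stdio.h>

    #define HASH_SZ 1024
    #define LOCK_SZ 256    /* power of two; many buckets share one lock */

    struct bucket { void *chain; };    /* one pointer, like the new rt_hash_bucket */

    static struct bucket table[HASH_SZ];
    static pthread_mutex_t locks[LOCK_SZ];

    static pthread_mutex_t *lock_addr(unsigned int slot)
    {
        return &locks[slot & (LOCK_SZ - 1)];    /* rt_hash_lock_addr() */
    }

    int main(void)
    {
        unsigned int i, slot = 777;

        for (i = 0; i < LOCK_SZ; i++)    /* rt_hash_lock_init() */
            pthread_mutex_init(&locks[i], NULL);

        pthread_mutex_lock(lock_addr(slot));
        table[slot].chain = NULL;    /* mutate the bucket under its stripe */
        pthread_mutex_unlock(lock_addr(slot));

        printf("bucket %u -> lock %u\n", slot, slot & (LOCK_SZ - 1));
        return 0;
    }
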
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0b..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
118 return 1; 118 return 1;
119} 119}
120 120
121static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
122 void __user *buffer, size_t *lenp, loff_t *ppos)
123{
124 char val[TCP_CA_NAME_MAX];
125 ctl_table tbl = {
126 .data = val,
127 .maxlen = TCP_CA_NAME_MAX,
128 };
129 int ret;
130
131 tcp_get_default_congestion_control(val);
132
133 ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
134 if (write && ret == 0)
135 ret = tcp_set_default_congestion_control(val);
136 return ret;
137}
138
139int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
140 void __user *oldval, size_t __user *oldlenp,
141 void __user *newval, size_t newlen,
142 void **context)
143{
144 char val[TCP_CA_NAME_MAX];
145 ctl_table tbl = {
146 .data = val,
147 .maxlen = TCP_CA_NAME_MAX,
148 };
149 int ret;
150
151 tcp_get_default_congestion_control(val);
152 ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
153 context);
154 if (ret == 0 && newval && newlen)
155 ret = tcp_set_default_congestion_control(val);
156 return ret;
157}
158
159
121ctl_table ipv4_table[] = { 160ctl_table ipv4_table[] = {
122 { 161 {
123 .ctl_name = NET_IPV4_TCP_TIMESTAMPS, 162 .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
@@ -612,70 +651,6 @@ ctl_table ipv4_table[] = {
612 .proc_handler = &proc_dointvec, 651 .proc_handler = &proc_dointvec,
613 }, 652 },
614 { 653 {
615 .ctl_name = NET_TCP_WESTWOOD,
616 .procname = "tcp_westwood",
617 .data = &sysctl_tcp_westwood,
618 .maxlen = sizeof(int),
619 .mode = 0644,
620 .proc_handler = &proc_dointvec,
621 },
622 {
623 .ctl_name = NET_TCP_VEGAS,
624 .procname = "tcp_vegas_cong_avoid",
625 .data = &sysctl_tcp_vegas_cong_avoid,
626 .maxlen = sizeof(int),
627 .mode = 0644,
628 .proc_handler = &proc_dointvec,
629 },
630 {
631 .ctl_name = NET_TCP_VEGAS_ALPHA,
632 .procname = "tcp_vegas_alpha",
633 .data = &sysctl_tcp_vegas_alpha,
634 .maxlen = sizeof(int),
635 .mode = 0644,
636 .proc_handler = &proc_dointvec,
637 },
638 {
639 .ctl_name = NET_TCP_VEGAS_BETA,
640 .procname = "tcp_vegas_beta",
641 .data = &sysctl_tcp_vegas_beta,
642 .maxlen = sizeof(int),
643 .mode = 0644,
644 .proc_handler = &proc_dointvec,
645 },
646 {
647 .ctl_name = NET_TCP_VEGAS_GAMMA,
648 .procname = "tcp_vegas_gamma",
649 .data = &sysctl_tcp_vegas_gamma,
650 .maxlen = sizeof(int),
651 .mode = 0644,
652 .proc_handler = &proc_dointvec,
653 },
654 {
655 .ctl_name = NET_TCP_BIC,
656 .procname = "tcp_bic",
657 .data = &sysctl_tcp_bic,
658 .maxlen = sizeof(int),
659 .mode = 0644,
660 .proc_handler = &proc_dointvec,
661 },
662 {
663 .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE,
664 .procname = "tcp_bic_fast_convergence",
665 .data = &sysctl_tcp_bic_fast_convergence,
666 .maxlen = sizeof(int),
667 .mode = 0644,
668 .proc_handler = &proc_dointvec,
669 },
670 {
671 .ctl_name = NET_TCP_BIC_LOW_WINDOW,
672 .procname = "tcp_bic_low_window",
673 .data = &sysctl_tcp_bic_low_window,
674 .maxlen = sizeof(int),
675 .mode = 0644,
676 .proc_handler = &proc_dointvec,
677 },
678 {
679 .ctl_name = NET_TCP_MODERATE_RCVBUF, 654 .ctl_name = NET_TCP_MODERATE_RCVBUF,
680 .procname = "tcp_moderate_rcvbuf", 655 .procname = "tcp_moderate_rcvbuf",
681 .data = &sysctl_tcp_moderate_rcvbuf, 656 .data = &sysctl_tcp_moderate_rcvbuf,
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
692 .proc_handler = &proc_dointvec, 667 .proc_handler = &proc_dointvec,
693 }, 668 },
694 { 669 {
695 .ctl_name = NET_TCP_BIC_BETA, 670 .ctl_name = NET_TCP_CONG_CONTROL,
696 .procname = "tcp_bic_beta", 671 .procname = "tcp_congestion_control",
697 .data = &sysctl_tcp_bic_beta,
698 .maxlen = sizeof(int),
699 .mode = 0644, 672 .mode = 0644,
700 .proc_handler = &proc_dointvec, 673 .maxlen = TCP_CA_NAME_MAX,
674 .proc_handler = &proc_tcp_congestion_control,
675 .strategy = &sysctl_tcp_congestion_control,
701 }, 676 },
677
702 { .ctl_name = 0 } 678 { .ctl_name = 0 }
703}; 679};
704 680
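
The sysctl rework above collapses eight per-algorithm integer knobs into one string, net.ipv4.tcp_congestion_control, whose proc handler validates the name through tcp_set_default_congestion_control(). Selecting the default algorithm then becomes a plain write to procfs, as in this usage sketch (needs root, and the named algorithm must be built in or loadable):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/net/ipv4/tcp_congestion_control", "w");

        if (!f) {
            perror("fopen");
            return 1;
        }
        fputs("reno\n", f);    /* tcp_set_default_congestion_control("reno") */
        return fclose(f) ? 1 : 0;
    }
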
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd36..ddb6ce4ecff2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
615 size_t psize, int flags) 615 size_t psize, int flags)
616{ 616{
617 struct tcp_sock *tp = tcp_sk(sk); 617 struct tcp_sock *tp = tcp_sk(sk);
618 int mss_now; 618 int mss_now, size_goal;
619 int err; 619 int err;
620 ssize_t copied; 620 ssize_t copied;
621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 621 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 628 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
629 629
630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 630 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
631 size_goal = tp->xmit_size_goal;
631 copied = 0; 632 copied = 0;
632 633
633 err = -EPIPE; 634 err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
641 int offset = poffset % PAGE_SIZE; 642 int offset = poffset % PAGE_SIZE;
642 int size = min_t(size_t, psize, PAGE_SIZE - offset); 643 int size = min_t(size_t, psize, PAGE_SIZE - offset);
643 644
644 if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { 645 if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
645new_segment: 646new_segment:
646 if (!sk_stream_memory_free(sk)) 647 if (!sk_stream_memory_free(sk))
647 goto wait_for_sndbuf; 648 goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
652 goto wait_for_memory; 653 goto wait_for_memory;
653 654
654 skb_entail(sk, tp, skb); 655 skb_entail(sk, tp, skb);
655 copy = mss_now; 656 copy = size_goal;
656 } 657 }
657 658
658 if (copy > size) 659 if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
693 if (!(psize -= copy)) 694 if (!(psize -= copy))
694 goto out; 695 goto out;
695 696
696 if (skb->len != mss_now || (flags & MSG_OOB)) 697 if (skb->len < mss_now || (flags & MSG_OOB))
697 continue; 698 continue;
698 699
699 if (forced_push(tp)) { 700 if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
713 goto do_error; 714 goto do_error;
714 715
715 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 716 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
717 size_goal = tp->xmit_size_goal;
716 } 718 }
717 719
718out: 720out:
@@ -754,15 +756,20 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
754 756
755static inline int select_size(struct sock *sk, struct tcp_sock *tp) 757static inline int select_size(struct sock *sk, struct tcp_sock *tp)
756{ 758{
757 int tmp = tp->mss_cache_std; 759 int tmp = tp->mss_cache;
758 760
759 if (sk->sk_route_caps & NETIF_F_SG) { 761 if (sk->sk_route_caps & NETIF_F_SG) {
760 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); 762 if (sk->sk_route_caps & NETIF_F_TSO)
763 tmp = 0;
764 else {
765 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
761 766
762 if (tmp >= pgbreak && 767 if (tmp >= pgbreak &&
763 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) 768 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
764 tmp = pgbreak; 769 tmp = pgbreak;
770 }
765 } 771 }
772
766 return tmp; 773 return tmp;
767} 774}
768 775
@@ -773,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
773 struct tcp_sock *tp = tcp_sk(sk); 780 struct tcp_sock *tp = tcp_sk(sk);
774 struct sk_buff *skb; 781 struct sk_buff *skb;
775 int iovlen, flags; 782 int iovlen, flags;
776 int mss_now; 783 int mss_now, size_goal;
777 int err, copied; 784 int err, copied;
778 long timeo; 785 long timeo;
779 786
@@ -792,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
792 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 799 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
793 800
794 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 801 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
802 size_goal = tp->xmit_size_goal;
795 803
796 /* Ok commence sending. */ 804 /* Ok commence sending. */
797 iovlen = msg->msg_iovlen; 805 iovlen = msg->msg_iovlen;
@@ -814,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
814 skb = sk->sk_write_queue.prev; 822 skb = sk->sk_write_queue.prev;
815 823
816 if (!sk->sk_send_head || 824 if (!sk->sk_send_head ||
817 (copy = mss_now - skb->len) <= 0) { 825 (copy = size_goal - skb->len) <= 0) {
818 826
819new_segment: 827new_segment:
820 /* Allocate new segment. If the interface is SG, 828 /* Allocate new segment. If the interface is SG,
@@ -837,7 +845,7 @@ new_segment:
837 skb->ip_summed = CHECKSUM_HW; 845 skb->ip_summed = CHECKSUM_HW;
838 846
839 skb_entail(sk, tp, skb); 847 skb_entail(sk, tp, skb);
840 copy = mss_now; 848 copy = size_goal;
841 } 849 }
842 850
843 /* Try to append data to the end of skb. */ 851 /* Try to append data to the end of skb. */
@@ -872,11 +880,6 @@ new_segment:
872 tcp_mark_push(tp, skb); 880 tcp_mark_push(tp, skb);
873 goto new_segment; 881 goto new_segment;
874 } else if (page) { 882 } else if (page) {
875 /* If page is cached, align
876 * offset to L1 cache boundary
877 */
878 off = (off + L1_CACHE_BYTES - 1) &
879 ~(L1_CACHE_BYTES - 1);
880 if (off == PAGE_SIZE) { 883 if (off == PAGE_SIZE) {
881 put_page(page); 884 put_page(page);
882 TCP_PAGE(sk) = page = NULL; 885 TCP_PAGE(sk) = page = NULL;
@@ -937,7 +940,7 @@ new_segment:
937 if ((seglen -= copy) == 0 && iovlen == 0) 940 if ((seglen -= copy) == 0 && iovlen == 0)
938 goto out; 941 goto out;
939 942
940 if (skb->len != mss_now || (flags & MSG_OOB)) 943 if (skb->len < mss_now || (flags & MSG_OOB))
941 continue; 944 continue;
942 945
943 if (forced_push(tp)) { 946 if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
957 goto do_error; 960 goto do_error;
958 961
959 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 962 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
963 size_goal = tp->xmit_size_goal;
960 } 964 }
961 } 965 }
962 966
@@ -1101,7 +1105,7 @@ static void tcp_prequeue_process(struct sock *sk)
1101 struct sk_buff *skb; 1105 struct sk_buff *skb;
1102 struct tcp_sock *tp = tcp_sk(sk); 1106 struct tcp_sock *tp = tcp_sk(sk);
1103 1107
1104 NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); 1108 NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1105 1109
1106 /* RX process wants to run with disabled BHs, though it is not 1110 /* RX process wants to run with disabled BHs, though it is not
1107 * necessary */ 1111 * necessary */
@@ -1365,7 +1369,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1365 * is not empty. It is more elegant, but eats cycles, 1369 * is not empty. It is more elegant, but eats cycles,
1366 * unfortunately. 1370 * unfortunately.
1367 */ 1371 */
1368 if (skb_queue_len(&tp->ucopy.prequeue)) 1372 if (!skb_queue_empty(&tp->ucopy.prequeue))
1369 goto do_prequeue; 1373 goto do_prequeue;
1370 1374
1371 /* __ Set realtime policy in scheduler __ */ 1375 /* __ Set realtime policy in scheduler __ */
@@ -1390,7 +1394,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1390 } 1394 }
1391 1395
1392 if (tp->rcv_nxt == tp->copied_seq && 1396 if (tp->rcv_nxt == tp->copied_seq &&
1393 skb_queue_len(&tp->ucopy.prequeue)) { 1397 !skb_queue_empty(&tp->ucopy.prequeue)) {
1394do_prequeue: 1398do_prequeue:
1395 tcp_prequeue_process(sk); 1399 tcp_prequeue_process(sk);
1396 1400
@@ -1472,7 +1476,7 @@ skip_copy:
1472 } while (len > 0); 1476 } while (len > 0);
1473 1477
1474 if (user_recv) { 1478 if (user_recv) {
1475 if (skb_queue_len(&tp->ucopy.prequeue)) { 1479 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1476 int chunk; 1480 int chunk;
1477 1481
1478 tp->ucopy.len = copied > 0 ? len : 0; 1482 tp->ucopy.len = copied > 0 ? len : 0;
@@ -1927,6 +1931,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
1927 return tp->af_specific->setsockopt(sk, level, optname, 1931 return tp->af_specific->setsockopt(sk, level, optname,
1928 optval, optlen); 1932 optval, optlen);
1929 1933
1934 /* This is a string value, all the others are int's */
1935 if (optname == TCP_CONGESTION) {
1936 char name[TCP_CA_NAME_MAX];
1937
1938 if (optlen < 1)
1939 return -EINVAL;
1940
1941 val = strncpy_from_user(name, optval,
1942 min(TCP_CA_NAME_MAX-1, optlen));
1943 if (val < 0)
1944 return -EFAULT;
1945 name[val] = 0;
1946
1947 lock_sock(sk);
1948 err = tcp_set_congestion_control(tp, name);
1949 release_sock(sk);
1950 return err;
1951 }
1952
1930 if (optlen < sizeof(int)) 1953 if (optlen < sizeof(int))
1931 return -EINVAL; 1954 return -EINVAL;
1932 1955
@@ -2109,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
2109 2132
2110 info->tcpi_rto = jiffies_to_usecs(tp->rto); 2133 info->tcpi_rto = jiffies_to_usecs(tp->rto);
2111 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); 2134 info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
2112 info->tcpi_snd_mss = tp->mss_cache_std; 2135 info->tcpi_snd_mss = tp->mss_cache;
2113 info->tcpi_rcv_mss = tp->ack.rcv_mss; 2136 info->tcpi_rcv_mss = tp->ack.rcv_mss;
2114 2137
2115 info->tcpi_unacked = tp->packets_out; 2138 info->tcpi_unacked = tp->packets_out;
@@ -2159,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2159 2182
2160 switch (optname) { 2183 switch (optname) {
2161 case TCP_MAXSEG: 2184 case TCP_MAXSEG:
2162 val = tp->mss_cache_std; 2185 val = tp->mss_cache;
2163 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) 2186 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2164 val = tp->rx_opt.user_mss; 2187 val = tp->rx_opt.user_mss;
2165 break; 2188 break;
@@ -2211,6 +2234,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2211 case TCP_QUICKACK: 2234 case TCP_QUICKACK:
2212 val = !tp->ack.pingpong; 2235 val = !tp->ack.pingpong;
2213 break; 2236 break;
2237
2238 case TCP_CONGESTION:
2239 if (get_user(len, optlen))
2240 return -EFAULT;
2241 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2242 if (put_user(len, optlen))
2243 return -EFAULT;
2244 if (copy_to_user(optval, tp->ca_ops->name, len))
2245 return -EFAULT;
2246 return 0;
2214 default: 2247 default:
2215 return -ENOPROTOOPT; 2248 return -ENOPROTOOPT;
2216 }; 2249 };
@@ -2224,7 +2257,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2224 2257
2225 2258
2226extern void __skb_cb_too_small_for_tcp(int, int); 2259extern void __skb_cb_too_small_for_tcp(int, int);
2227extern void tcpdiag_init(void); 2260extern struct tcp_congestion_ops tcp_reno;
2228 2261
2229static __initdata unsigned long thash_entries; 2262static __initdata unsigned long thash_entries;
2230static int __init set_thash_entries(char *str) 2263static int __init set_thash_entries(char *str)
@@ -2333,6 +2366,8 @@ void __init tcp_init(void)
2333 printk(KERN_INFO "TCP: Hash tables configured " 2366 printk(KERN_INFO "TCP: Hash tables configured "
2334 "(established %d bind %d)\n", 2367 "(established %d bind %d)\n",
2335 tcp_ehash_size << 1, tcp_bhash_size); 2368 tcp_ehash_size << 1, tcp_bhash_size);
2369
2370 tcp_register_congestion_control(&tcp_reno);
2336} 2371}
2337 2372
2338EXPORT_SYMBOL(tcp_accept); 2373EXPORT_SYMBOL(tcp_accept);
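
The TCP_CONGESTION option added above takes a string rather than an int. A minimal userspace sketch (not part of this patch) of how it might be exercised; the fallback define mirrors the kernel's value, the 16-byte buffer matches TCP_CA_NAME_MAX, and "bic" assumes the tcp_bic module introduced below is available:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13	/* mirrors include/linux/tcp.h */
#endif

int main(void)
{
	char name[16];			/* TCP_CA_NAME_MAX */
	socklen_t len = sizeof(name);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;

	/* Request BIC; fails with ENOENT if tcp_bic is unavailable. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bic", 3) < 0)
		perror("setsockopt TCP_CONGESTION");

	/* Read back whichever congestion control is actually in use. */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, &len) == 0)
		printf("congestion control: %.*s\n", (int)len, name);
	return 0;
}
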
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
new file mode 100644
index 000000000000..ec38d45d6649
--- /dev/null
+++ b/net/ipv4/tcp_bic.c
@@ -0,0 +1,331 @@
1/*
2 * Binary Increase Congestion control for TCP
3 *
4 * This is from the implementation of BICTCP in
 5 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
6 * "Binary Increase Congestion Control for Fast, Long Distance
 7 * Networks" in IEEE INFOCOM 2004
8 * Available from:
9 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
10 *
11 * Unless BIC is enabled and congestion window is large
12 * this behaves the same as the original Reno.
13 */
14
15#include <linux/config.h>
16#include <linux/mm.h>
17#include <linux/module.h>
18#include <net/tcp.h>
19
20
 21#define BICTCP_BETA_SCALE 1024 /* Scale factor for beta calculation:
22 * max_cwnd = snd_cwnd * beta
23 */
24#define BICTCP_B 4 /*
25 * In binary search,
26 * go to point (max+min)/N
27 */
28
29static int fast_convergence = 1;
30static int max_increment = 32;
31static int low_window = 14;
32static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
33static int low_utilization_threshold = 153;
34static int low_utilization_period = 2;
35static int initial_ssthresh = 100;
36static int smooth_part = 20;
37
38module_param(fast_convergence, int, 0644);
39MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
40module_param(max_increment, int, 0644);
41MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
42module_param(low_window, int, 0644);
43MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
44module_param(beta, int, 0644);
 45MODULE_PARM_DESC(beta, "beta for multiplicative decrease");
46module_param(low_utilization_threshold, int, 0644);
47MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode");
48module_param(low_utilization_period, int, 0644);
 49MODULE_PARM_DESC(low_utilization_period, "seconds the low-delay condition must hold before entering low utilization mode");
50module_param(initial_ssthresh, int, 0644);
51MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
52module_param(smooth_part, int, 0644);
53MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
54
55
56/* BIC TCP Parameters */
57struct bictcp {
 58	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
59 u32 last_max_cwnd; /* last maximum snd_cwnd */
60 u32 loss_cwnd; /* congestion window at last loss */
61 u32 last_cwnd; /* the last snd_cwnd */
62 u32 last_time; /* time when updated last_cwnd */
63 u32 delay_min; /* min delay */
64 u32 delay_max; /* max delay */
65 u32 last_delay;
66 u8 low_utilization;/* 0: high; 1: low */
67 u32 low_utilization_start; /* starting time of low utilization detection*/
68 u32 epoch_start; /* beginning of an epoch */
69#define ACK_RATIO_SHIFT 4
70 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
71};
72
73static inline void bictcp_reset(struct bictcp *ca)
74{
75 ca->cnt = 0;
76 ca->last_max_cwnd = 0;
77 ca->loss_cwnd = 0;
78 ca->last_cwnd = 0;
79 ca->last_time = 0;
80 ca->delay_min = 0;
81 ca->delay_max = 0;
82 ca->last_delay = 0;
83 ca->low_utilization = 0;
84 ca->low_utilization_start = 0;
85 ca->epoch_start = 0;
86 ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
87}
88
89static void bictcp_init(struct tcp_sock *tp)
90{
91 bictcp_reset(tcp_ca(tp));
92 if (initial_ssthresh)
93 tp->snd_ssthresh = initial_ssthresh;
94}
95
96/*
97 * Compute congestion window to use.
98 */
99static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
100{
101 if (ca->last_cwnd == cwnd &&
102 (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
103 return;
104
105 ca->last_cwnd = cwnd;
106 ca->last_time = tcp_time_stamp;
107
108 if (ca->epoch_start == 0) /* record the beginning of an epoch */
109 ca->epoch_start = tcp_time_stamp;
110
111 /* start off normal */
112 if (cwnd <= low_window) {
113 ca->cnt = cwnd;
114 return;
115 }
116
117 /* binary increase */
118 if (cwnd < ca->last_max_cwnd) {
119 __u32 dist = (ca->last_max_cwnd - cwnd)
120 / BICTCP_B;
121
122 if (dist > max_increment)
123 /* linear increase */
124 ca->cnt = cwnd / max_increment;
125 else if (dist <= 1U)
126 /* binary search increase */
127 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
128 else
129 /* binary search increase */
130 ca->cnt = cwnd / dist;
131 } else {
132		/* slow start and linear increase */
133 if (cwnd < ca->last_max_cwnd + BICTCP_B)
134 /* slow start */
135 ca->cnt = (cwnd * smooth_part) / BICTCP_B;
136 else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
137 /* slow start */
138 ca->cnt = (cwnd * (BICTCP_B-1))
139				/ (cwnd - ca->last_max_cwnd);
140 else
141 /* linear increase */
142 ca->cnt = cwnd / max_increment;
143 }
144
145 /* if in slow start or link utilization is very low */
146	if (ca->loss_cwnd == 0 ||
147 (cwnd > ca->loss_cwnd && ca->low_utilization)) {
148 if (ca->cnt > 20) /* increase cwnd 5% per RTT */
149 ca->cnt = 20;
150 }
151
152 ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
153 if (ca->cnt == 0) /* cannot be zero */
154 ca->cnt = 1;
155}
156
157
158/* Detect low utilization in congestion avoidance */
159static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag)
160{
161 struct bictcp *ca = tcp_ca(tp);
162 u32 dist, delay;
163
164 /* No time stamp */
165 if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) ||
166 /* Discard delay samples right after fast recovery */
167 tcp_time_stamp < ca->epoch_start + HZ ||
168	    /* these delay samples may not be accurate */
169 flag == 0) {
170 ca->last_delay = 0;
171 goto notlow;
172 }
173
174 delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/
175 ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
176 if (delay == 0) /* no previous delay sample */
177 goto notlow;
178
179 /* first time call or link delay decreases */
180 if (ca->delay_min == 0 || ca->delay_min > delay) {
181 ca->delay_min = ca->delay_max = delay;
182 goto notlow;
183 }
184
185 if (ca->delay_max < delay)
186 ca->delay_max = delay;
187
188 /* utilization is low, if avg delay < dist*threshold
189	   for low_utilization_period time */
190 dist = ca->delay_max - ca->delay_min;
191 if (dist <= ca->delay_min>>6 ||
192 tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10)
193 goto notlow;
194
195 if (ca->low_utilization_start == 0) {
196 ca->low_utilization = 0;
197 ca->low_utilization_start = tcp_time_stamp;
198 } else if ((s32)(tcp_time_stamp - ca->low_utilization_start)
199 > low_utilization_period*HZ) {
200 ca->low_utilization = 1;
201 }
202
203 return;
204
205 notlow:
206 ca->low_utilization = 0;
207 ca->low_utilization_start = 0;
208
209}
210
211static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack,
212 u32 seq_rtt, u32 in_flight, int data_acked)
213{
214 struct bictcp *ca = tcp_ca(tp);
215
216 bictcp_low_utilization(tp, data_acked);
217
218 if (in_flight < tp->snd_cwnd)
219 return;
220
221 if (tp->snd_cwnd <= tp->snd_ssthresh) {
222 /* In "safe" area, increase. */
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 } else {
226 bictcp_update(ca, tp->snd_cwnd);
227
228 /* In dangerous area, increase slowly.
229 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
230 */
231 if (tp->snd_cwnd_cnt >= ca->cnt) {
232 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
233 tp->snd_cwnd++;
234 tp->snd_cwnd_cnt = 0;
235 } else
236 tp->snd_cwnd_cnt++;
237 }
238
239}
240
241/*
242 * behave like Reno until low_window is reached,
243 * then increase congestion window slowly
244 */
245static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp)
246{
247 struct bictcp *ca = tcp_ca(tp);
248
249 ca->epoch_start = 0; /* end of epoch */
250
251 /* in case of wrong delay_max*/
252 if (ca->delay_min > 0 && ca->delay_max > ca->delay_min)
253 ca->delay_max = ca->delay_min
254 + ((ca->delay_max - ca->delay_min)* 90) / 100;
255
256 /* Wmax and fast convergence */
257 if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
258 ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
259 / (2 * BICTCP_BETA_SCALE);
260 else
261 ca->last_max_cwnd = tp->snd_cwnd;
262
263 ca->loss_cwnd = tp->snd_cwnd;
264
265
266 if (tp->snd_cwnd <= low_window)
267 return max(tp->snd_cwnd >> 1U, 2U);
268 else
269 return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
270}
271
272static u32 bictcp_undo_cwnd(struct tcp_sock *tp)
273{
274 struct bictcp *ca = tcp_ca(tp);
275
276 return max(tp->snd_cwnd, ca->last_max_cwnd);
277}
278
279static u32 bictcp_min_cwnd(struct tcp_sock *tp)
280{
281 return tp->snd_ssthresh;
282}
283
284static void bictcp_state(struct tcp_sock *tp, u8 new_state)
285{
286 if (new_state == TCP_CA_Loss)
287 bictcp_reset(tcp_ca(tp));
288}
289
290/* Track delayed acknowledgement ratio using sliding window
291 * ratio = (15*ratio + sample) / 16
292 */
293static void bictcp_acked(struct tcp_sock *tp, u32 cnt)
294{
295 if (cnt > 0 && tp->ca_state == TCP_CA_Open) {
296 struct bictcp *ca = tcp_ca(tp);
297 cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
298 ca->delayed_ack += cnt;
299 }
300}
301
302
303static struct tcp_congestion_ops bictcp = {
304 .init = bictcp_init,
305 .ssthresh = bictcp_recalc_ssthresh,
306 .cong_avoid = bictcp_cong_avoid,
307 .set_state = bictcp_state,
308 .undo_cwnd = bictcp_undo_cwnd,
309 .min_cwnd = bictcp_min_cwnd,
310 .pkts_acked = bictcp_acked,
311 .owner = THIS_MODULE,
312 .name = "bic",
313};
314
315static int __init bictcp_register(void)
316{
317 BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE);
318 return tcp_register_congestion_control(&bictcp);
319}
320
321static void __exit bictcp_unregister(void)
322{
323 tcp_unregister_congestion_control(&bictcp);
324}
325
326module_init(bictcp_register);
327module_exit(bictcp_unregister);
328
329MODULE_AUTHOR("Stephen Hemminger");
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("BIC TCP");
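
To make the window-growth rules above concrete, the following standalone sketch (ordinary userspace C, not kernel code) mirrors the cnt selection in bictcp_update() with the default module parameters; the delayed-ACK scaling and the low-utilization clamp are omitted for brevity. snd_cwnd then grows by one segment for every cnt ACKs:

#include <stdio.h>

#define BICTCP_B	4
static const unsigned max_increment = 32, low_window = 14, smooth_part = 20;

/* Mirror of the cnt selection in bictcp_update() above. */
static unsigned bic_cnt(unsigned cwnd, unsigned last_max_cwnd)
{
	unsigned dist;

	if (cwnd <= low_window)
		return cwnd;				/* Reno-like region */
	if (cwnd < last_max_cwnd) {
		dist = (last_max_cwnd - cwnd) / BICTCP_B;
		if (dist > max_increment)
			return cwnd / max_increment;	/* linear increase */
		if (dist <= 1)
			return (cwnd * smooth_part) / BICTCP_B;
		return cwnd / dist;			/* binary search */
	}
	if (cwnd < last_max_cwnd + BICTCP_B)
		return (cwnd * smooth_part) / BICTCP_B;	/* slow start */
	if (cwnd < last_max_cwnd + max_increment * (BICTCP_B - 1))
		return (cwnd * (BICTCP_B - 1)) / (cwnd - last_max_cwnd);
	return cwnd / max_increment;			/* max probing */
}

int main(void)
{
	unsigned cwnd;

	for (cwnd = 100; cwnd <= 1600; cwnd *= 2)
		printf("cwnd %4u, last_max 1000 -> +1 every %u ACKs\n",
		       cwnd, bic_cnt(cwnd, 1000));
	return 0;
}
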
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..4970d10a7785
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,237 @@
1/*
 2 * Pluggable TCP congestion control support and NewReno
 3 * congestion control.
 4 * Based on ideas from I/O scheduler support and Web100.
5 *
6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <linux/mm.h>
12#include <linux/types.h>
13#include <linux/list.h>
14#include <net/tcp.h>
15
16static DEFINE_SPINLOCK(tcp_cong_list_lock);
17static LIST_HEAD(tcp_cong_list);
18
19/* Simple linear search, don't expect many entries! */
20static struct tcp_congestion_ops *tcp_ca_find(const char *name)
21{
22 struct tcp_congestion_ops *e;
23
24 list_for_each_entry_rcu(e, &tcp_cong_list, list) {
25 if (strcmp(e->name, name) == 0)
26 return e;
27 }
28
29 return NULL;
30}
31
32/*
 33 * Attach a new congestion control algorithm to the list
34 * of available options.
35 */
36int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
37{
38 int ret = 0;
39
40 /* all algorithms must implement ssthresh and cong_avoid ops */
41 if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
42 printk(KERN_ERR "TCP %s does not implement required ops\n",
43 ca->name);
44 return -EINVAL;
45 }
46
47 spin_lock(&tcp_cong_list_lock);
48 if (tcp_ca_find(ca->name)) {
49 printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
50 ret = -EEXIST;
51 } else {
52 list_add_rcu(&ca->list, &tcp_cong_list);
53 printk(KERN_INFO "TCP %s registered\n", ca->name);
54 }
55 spin_unlock(&tcp_cong_list_lock);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
60
61/*
62 * Remove congestion control algorithm, called from
63 * the module's remove function. Module ref counts are used
64 * to ensure that this can't be done till all sockets using
65 * that method are closed.
66 */
67void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
68{
69 spin_lock(&tcp_cong_list_lock);
70 list_del_rcu(&ca->list);
71 spin_unlock(&tcp_cong_list_lock);
72}
73EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
74
75/* Assign choice of congestion control. */
76void tcp_init_congestion_control(struct tcp_sock *tp)
77{
78 struct tcp_congestion_ops *ca;
79
80 if (tp->ca_ops != &tcp_init_congestion_ops)
81 return;
82
83 rcu_read_lock();
84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
85 if (try_module_get(ca->owner)) {
86 tp->ca_ops = ca;
87 break;
88 }
89
90 }
91 rcu_read_unlock();
92
93 if (tp->ca_ops->init)
94 tp->ca_ops->init(tp);
95}
96
97/* Manage refcounts on socket close. */
98void tcp_cleanup_congestion_control(struct tcp_sock *tp)
99{
100 if (tp->ca_ops->release)
101 tp->ca_ops->release(tp);
102 module_put(tp->ca_ops->owner);
103}
104
105/* Used by sysctl to change default congestion control */
106int tcp_set_default_congestion_control(const char *name)
107{
108 struct tcp_congestion_ops *ca;
109 int ret = -ENOENT;
110
111 spin_lock(&tcp_cong_list_lock);
112 ca = tcp_ca_find(name);
113#ifdef CONFIG_KMOD
114 if (!ca) {
115 spin_unlock(&tcp_cong_list_lock);
116
117 request_module("tcp_%s", name);
118 spin_lock(&tcp_cong_list_lock);
119 ca = tcp_ca_find(name);
120 }
121#endif
122
123 if (ca) {
124 list_move(&ca->list, &tcp_cong_list);
125 ret = 0;
126 }
127 spin_unlock(&tcp_cong_list_lock);
128
129 return ret;
130}
131
132/* Get current default congestion control */
133void tcp_get_default_congestion_control(char *name)
134{
135 struct tcp_congestion_ops *ca;
136 /* We will always have reno... */
137 BUG_ON(list_empty(&tcp_cong_list));
138
139 rcu_read_lock();
140 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
141 strncpy(name, ca->name, TCP_CA_NAME_MAX);
142 rcu_read_unlock();
143}
144
145/* Change congestion control for socket */
146int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
147{
148 struct tcp_congestion_ops *ca;
149 int err = 0;
150
151 rcu_read_lock();
152 ca = tcp_ca_find(name);
153 if (ca == tp->ca_ops)
154 goto out;
155
156 if (!ca)
157 err = -ENOENT;
158
159 else if (!try_module_get(ca->owner))
160 err = -EBUSY;
161
162 else {
163 tcp_cleanup_congestion_control(tp);
164 tp->ca_ops = ca;
165 if (tp->ca_ops->init)
166 tp->ca_ops->init(tp);
167 }
168 out:
169 rcu_read_unlock();
170 return err;
171}
172
173/*
174 * TCP Reno congestion control
175 * This is a special case, used for fallback as well.
176 */
177/* This is Jacobson's slow start and congestion avoidance.
178 * SIGCOMM '88, p. 328.
179 */
180void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
181 int flag)
182{
183 if (in_flight < tp->snd_cwnd)
184 return;
185
186 if (tp->snd_cwnd <= tp->snd_ssthresh) {
187 /* In "safe" area, increase. */
188 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
189 tp->snd_cwnd++;
190 } else {
191 /* In dangerous area, increase slowly.
192 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
193 */
194 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
195 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
196 tp->snd_cwnd++;
197 tp->snd_cwnd_cnt = 0;
198 } else
199 tp->snd_cwnd_cnt++;
200 }
201}
202EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
203
204/* Slow start threshold is half the congestion window (min 2) */
205u32 tcp_reno_ssthresh(struct tcp_sock *tp)
206{
207 return max(tp->snd_cwnd >> 1U, 2U);
208}
209EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
210
211/* Lower bound on congestion window. */
212u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
213{
214 return tp->snd_ssthresh/2;
215}
216EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
217
218struct tcp_congestion_ops tcp_reno = {
219 .name = "reno",
220 .owner = THIS_MODULE,
221 .ssthresh = tcp_reno_ssthresh,
222 .cong_avoid = tcp_reno_cong_avoid,
223 .min_cwnd = tcp_reno_min_cwnd,
224};
225
226/* Initial congestion control used (until SYN).
227 * Really Reno under another name so we can tell the difference
228 * during tcp_set_default_congestion_control().
229 */
230struct tcp_congestion_ops tcp_init_congestion_ops = {
231 .name = "",
232 .owner = THIS_MODULE,
233 .ssthresh = tcp_reno_ssthresh,
234 .cong_avoid = tcp_reno_cong_avoid,
235 .min_cwnd = tcp_reno_min_cwnd,
236};
237EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
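
Everything an out-of-tree congestion control needs is exported above. A minimal sketch of such a module, which simply re-badges the exported Reno helpers under a new name ("myreno" and the file name tcp_myreno.c are hypothetical):

#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_myreno = {
	.name		= "myreno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,	/* exported above */
	.cong_avoid	= tcp_reno_cong_avoid,	/* exported above */
	.min_cwnd	= tcp_reno_min_cwnd,	/* exported above */
};

static int __init myreno_register(void)
{
	/* Fails with -EEXIST if the name is already taken. */
	return tcp_register_congestion_control(&tcp_myreno);
}

static void __exit myreno_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_myreno);
}

module_init(myreno_register);
module_exit(myreno_unregister);
MODULE_LICENSE("GPL");

Once loaded, the module is selectable per socket via the TCP_CONGESTION option shown earlier, or system-wide through tcp_set_default_congestion_control().
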
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc07921..f66945cb158f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,15 +42,8 @@ struct tcpdiag_entry
42 42
43static struct sock *tcpnl; 43static struct sock *tcpnl;
44 44
45
46#define TCPDIAG_PUT(skb, attrtype, attrlen) \ 45#define TCPDIAG_PUT(skb, attrtype, attrlen) \
47({ int rtalen = RTA_LENGTH(attrlen); \ 46 RTA_DATA(__RTA_PUT(skb, attrtype, attrlen))
48 struct rtattr *rta; \
49 if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
50 rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
51 rta->rta_type = attrtype; \
52 rta->rta_len = rtalen; \
53 RTA_DATA(rta); })
54 47
55static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, 48static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
56 int ext, u32 pid, u32 seq, u16 nlmsg_flags) 49 int ext, u32 pid, u32 seq, u16 nlmsg_flags)
@@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
61 struct nlmsghdr *nlh; 54 struct nlmsghdr *nlh;
62 struct tcp_info *info = NULL; 55 struct tcp_info *info = NULL;
63 struct tcpdiag_meminfo *minfo = NULL; 56 struct tcpdiag_meminfo *minfo = NULL;
64 struct tcpvegas_info *vinfo = NULL;
65 unsigned char *b = skb->tail; 57 unsigned char *b = skb->tail;
66 58
67 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); 59 nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
73 if (ext & (1<<(TCPDIAG_INFO-1))) 65 if (ext & (1<<(TCPDIAG_INFO-1)))
74 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); 66 info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
75 67
76 if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) 68 if (ext & (1<<(TCPDIAG_CONG-1))) {
77 && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) 69 size_t len = strlen(tp->ca_ops->name);
78 vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); 70 strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
71 tp->ca_ops->name);
72 }
79 } 73 }
80 r->tcpdiag_family = sk->sk_family; 74 r->tcpdiag_family = sk->sk_family;
81 r->tcpdiag_state = sk->sk_state; 75 r->tcpdiag_state = sk->sk_state;
@@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
166 if (info) 160 if (info)
167 tcp_get_info(sk, info); 161 tcp_get_info(sk, info);
168 162
169 if (vinfo) { 163 if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
170 if (tcp_is_vegas(tp)) { 164 tp->ca_ops->get_info(tp, ext, skb);
171 vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
172 vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
173 vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
174 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
175 } else {
176 vinfo->tcpv_enabled = 0;
177 vinfo->tcpv_rttcnt = 0;
178 vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
179 vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
180 }
181 }
182 165
183 nlh->nlmsg_len = skb->tail - b; 166 nlh->nlmsg_len = skb->tail - b;
184 return skb->len; 167 return skb->len;
185 168
169rtattr_failure:
186nlmsg_failure: 170nlmsg_failure:
187 skb_trim(skb, b - skb->data); 171 skb_trim(skb, b - skb->data);
188 return -1; 172 return -1;
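
The per-algorithm get_info() hook invoked above replaces the inline Vegas/Westwood code that was removed. A hedged sketch of how a Vegas-style module might implement it, reusing the tcpvegas_info layout from the deleted lines ("myca" is a placeholder and the field values are illustrative):

#include <linux/rtnetlink.h>
#include <linux/tcp_diag.h>
#include <net/tcp.h>

static void myca_get_info(struct tcp_sock *tp, u32 ext, struct sk_buff *skb)
{
	if (ext & (1 << (TCPDIAG_VEGASINFO - 1))) {
		struct tcpvegas_info *info;

		info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
					  sizeof(*info)));
		info->tcpv_enabled = 0;				/* illustrative */
		info->tcpv_rttcnt = 0;				/* illustrative */
		info->tcpv_rtt = jiffies_to_usecs(tp->srtt >> 3);
		info->tcpv_minrtt = 0;				/* illustrative */
	rtattr_failure: ;	/* jumped to by __RTA_PUT if the skb is full */
	}
}
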
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
new file mode 100644
index 000000000000..36c51f8136bf
--- /dev/null
+++ b/net/ipv4/tcp_highspeed.c
@@ -0,0 +1,181 @@
1/*
2 * Sally Floyd's High Speed TCP (RFC 3649) congestion control
3 *
4 * See http://www.icir.org/floyd/hstcp.html
5 *
6 * John Heffner <jheffner@psc.edu>
7 */
8
9#include <linux/config.h>
10#include <linux/module.h>
11#include <net/tcp.h>
12
13
14/* From AIMD tables from RFC 3649 appendix B,
15 * with fixed-point MD scaled <<8.
16 */
17static const struct hstcp_aimd_val {
18 unsigned int cwnd;
19 unsigned int md;
20} hstcp_aimd_vals[] = {
21 { 38, 128, /* 0.50 */ },
22 { 118, 112, /* 0.44 */ },
23 { 221, 104, /* 0.41 */ },
24 { 347, 98, /* 0.38 */ },
25 { 495, 93, /* 0.37 */ },
26 { 663, 89, /* 0.35 */ },
27 { 851, 86, /* 0.34 */ },
28 { 1058, 83, /* 0.33 */ },
29 { 1284, 81, /* 0.32 */ },
30 { 1529, 78, /* 0.31 */ },
31 { 1793, 76, /* 0.30 */ },
32 { 2076, 74, /* 0.29 */ },
33 { 2378, 72, /* 0.28 */ },
34 { 2699, 71, /* 0.28 */ },
35 { 3039, 69, /* 0.27 */ },
36 { 3399, 68, /* 0.27 */ },
37 { 3778, 66, /* 0.26 */ },
38 { 4177, 65, /* 0.26 */ },
39 { 4596, 64, /* 0.25 */ },
40 { 5036, 62, /* 0.25 */ },
41 { 5497, 61, /* 0.24 */ },
42 { 5979, 60, /* 0.24 */ },
43 { 6483, 59, /* 0.23 */ },
44 { 7009, 58, /* 0.23 */ },
45 { 7558, 57, /* 0.22 */ },
46 { 8130, 56, /* 0.22 */ },
47 { 8726, 55, /* 0.22 */ },
48 { 9346, 54, /* 0.21 */ },
49 { 9991, 53, /* 0.21 */ },
50 { 10661, 52, /* 0.21 */ },
51 { 11358, 52, /* 0.20 */ },
52 { 12082, 51, /* 0.20 */ },
53 { 12834, 50, /* 0.20 */ },
54 { 13614, 49, /* 0.19 */ },
55 { 14424, 48, /* 0.19 */ },
56 { 15265, 48, /* 0.19 */ },
57 { 16137, 47, /* 0.19 */ },
58 { 17042, 46, /* 0.18 */ },
59 { 17981, 45, /* 0.18 */ },
60 { 18955, 45, /* 0.18 */ },
61 { 19965, 44, /* 0.17 */ },
62 { 21013, 43, /* 0.17 */ },
63 { 22101, 43, /* 0.17 */ },
64 { 23230, 42, /* 0.17 */ },
65 { 24402, 41, /* 0.16 */ },
66 { 25618, 41, /* 0.16 */ },
67 { 26881, 40, /* 0.16 */ },
68 { 28193, 39, /* 0.16 */ },
69 { 29557, 39, /* 0.15 */ },
70 { 30975, 38, /* 0.15 */ },
71 { 32450, 38, /* 0.15 */ },
72 { 33986, 37, /* 0.15 */ },
73 { 35586, 36, /* 0.14 */ },
74 { 37253, 36, /* 0.14 */ },
75 { 38992, 35, /* 0.14 */ },
76 { 40808, 35, /* 0.14 */ },
77 { 42707, 34, /* 0.13 */ },
78 { 44694, 33, /* 0.13 */ },
79 { 46776, 33, /* 0.13 */ },
80 { 48961, 32, /* 0.13 */ },
81 { 51258, 32, /* 0.13 */ },
82 { 53677, 31, /* 0.12 */ },
83 { 56230, 30, /* 0.12 */ },
84 { 58932, 30, /* 0.12 */ },
85 { 61799, 29, /* 0.12 */ },
86 { 64851, 28, /* 0.11 */ },
87 { 68113, 28, /* 0.11 */ },
88 { 71617, 27, /* 0.11 */ },
89 { 75401, 26, /* 0.10 */ },
90 { 79517, 26, /* 0.10 */ },
91 { 84035, 25, /* 0.10 */ },
92 { 89053, 24, /* 0.10 */ },
93};
94
95#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals)
96
97struct hstcp {
98 u32 ai;
99};
100
101static void hstcp_init(struct tcp_sock *tp)
102{
103 struct hstcp *ca = tcp_ca(tp);
104
105 ca->ai = 0;
106
107 /* Ensure the MD arithmetic works. This is somewhat pedantic,
108 * since I don't think we will see a cwnd this large. :) */
109 tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
110}
111
112static void hstcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
113 u32 in_flight, int good)
114{
115 struct hstcp *ca = tcp_ca(tp);
116
117 if (in_flight < tp->snd_cwnd)
118 return;
119
120 if (tp->snd_cwnd <= tp->snd_ssthresh) {
121 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
122 tp->snd_cwnd++;
123 } else {
124 /* Update AIMD parameters */
125		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
126			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
127			       ca->ai < HSTCP_AIMD_MAX - 1)
128				ca->ai++;
129		} else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
130			while (ca->ai &&
131			       tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
132				ca->ai--;
133		}
134
135 /* Do additive increase */
136 if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
137 tp->snd_cwnd_cnt += ca->ai;
138 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
139 tp->snd_cwnd++;
140 tp->snd_cwnd_cnt -= tp->snd_cwnd;
141 }
142 }
143 }
144}
145
146static u32 hstcp_ssthresh(struct tcp_sock *tp)
147{
148 struct hstcp *ca = tcp_ca(tp);
149
150 /* Do multiplicative decrease */
151 return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
152}
153
154
155static struct tcp_congestion_ops tcp_highspeed = {
156 .init = hstcp_init,
157 .ssthresh = hstcp_ssthresh,
158 .cong_avoid = hstcp_cong_avoid,
159 .min_cwnd = tcp_reno_min_cwnd,
160
161 .owner = THIS_MODULE,
162 .name = "highspeed"
163};
164
165static int __init hstcp_register(void)
166{
167 BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE);
168 return tcp_register_congestion_control(&tcp_highspeed);
169}
170
171static void __exit hstcp_unregister(void)
172{
173 tcp_unregister_congestion_control(&tcp_highspeed);
174}
175
176module_init(hstcp_register);
177module_exit(hstcp_unregister);
178
179MODULE_AUTHOR("John Heffner");
180MODULE_LICENSE("GPL");
181MODULE_DESCRIPTION("High Speed TCP");
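
A small standalone sketch (not kernel code) of what the AIMD table above yields: given a cwnd, the index walk from hstcp_cong_avoid() and the multiplicative decrease applied by hstcp_ssthresh(). The table is truncated to its first five rows for brevity:

#include <stdio.h>

struct row { unsigned cwnd, md; };
static const struct row vals[] = {
	{ 38, 128 }, { 118, 112 }, { 221, 104 }, { 347, 98 }, { 495, 93 },
};
#define N (sizeof(vals) / sizeof(vals[0]))

int main(void)
{
	unsigned cwnd = 300, ai = 0;

	/* Same upward walk as hstcp_cong_avoid(). */
	while (ai < N - 1 && cwnd > vals[ai].cwnd)
		ai++;

	/* Same arithmetic as hstcp_ssthresh(): cwnd - cwnd*md/256. */
	printf("cwnd %u: ai=%u, after loss -> %u\n",
	       cwnd, ai, cwnd - ((cwnd * vals[ai].md) >> 8));
	return 0;
}
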
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
new file mode 100644
index 000000000000..40168275acf9
--- /dev/null
+++ b/net/ipv4/tcp_htcp.c
@@ -0,0 +1,289 @@
1/*
2 * H-TCP congestion control. The algorithm is detailed in:
3 * R.N.Shorten, D.J.Leith:
4 * "H-TCP: TCP for high-speed and long-distance networks"
5 * Proc. PFLDnet, Argonne, 2004.
6 * http://www.hamilton.ie/net/htcp3.pdf
7 */
8
9#include <linux/config.h>
10#include <linux/mm.h>
11#include <linux/module.h>
12#include <net/tcp.h>
13
14#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */
15#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */
16#define BETA_MAX 102 /* 0.8 with shift << 7 */
17
18static int use_rtt_scaling = 1;
19module_param(use_rtt_scaling, int, 0644);
20MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");
21
22static int use_bandwidth_switch = 1;
23module_param(use_bandwidth_switch, int, 0644);
24MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
25
26struct htcp {
27 u16 alpha; /* Fixed point arith, << 7 */
28 u8 beta; /* Fixed point arith, << 7 */
 29	u8	modeswitch;	/* Delay modeswitch until we have had at least one congestion event */
30 u8 ccount; /* Number of RTTs since last congestion event */
31 u8 undo_ccount;
32 u16 packetcount;
33 u32 minRTT;
34 u32 maxRTT;
35 u32 snd_cwnd_cnt2;
36
37 u32 undo_maxRTT;
38 u32 undo_old_maxB;
39
40 /* Bandwidth estimation */
41 u32 minB;
42 u32 maxB;
43 u32 old_maxB;
44 u32 Bi;
45 u32 lasttime;
46};
47
48static inline void htcp_reset(struct htcp *ca)
49{
50 ca->undo_ccount = ca->ccount;
51 ca->undo_maxRTT = ca->maxRTT;
52 ca->undo_old_maxB = ca->old_maxB;
53
54 ca->ccount = 0;
55 ca->snd_cwnd_cnt2 = 0;
56}
57
58static u32 htcp_cwnd_undo(struct tcp_sock *tp)
59{
60 struct htcp *ca = tcp_ca(tp);
61 ca->ccount = ca->undo_ccount;
62 ca->maxRTT = ca->undo_maxRTT;
63 ca->old_maxB = ca->undo_old_maxB;
64 return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta);
65}
66
67static inline void measure_rtt(struct tcp_sock *tp)
68{
69 struct htcp *ca = tcp_ca(tp);
70 u32 srtt = tp->srtt>>3;
71
72 /* keep track of minimum RTT seen so far, minRTT is zero at first */
73 if (ca->minRTT > srtt || !ca->minRTT)
74 ca->minRTT = srtt;
75
76 /* max RTT */
77 if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) {
78 if (ca->maxRTT < ca->minRTT)
79 ca->maxRTT = ca->minRTT;
80 if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50)
81 ca->maxRTT = srtt;
82 }
83}
84
85static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked)
86{
87 struct htcp *ca = tcp_ca(tp);
88 u32 now = tcp_time_stamp;
89
90 /* achieved throughput calculations */
91 if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) {
92 ca->packetcount = 0;
93 ca->lasttime = now;
94 return;
95 }
96
97 ca->packetcount += pkts_acked;
98
99 if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1)
100 && now - ca->lasttime >= ca->minRTT
101 && ca->minRTT > 0) {
102 __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime);
103 if (ca->ccount <= 3) {
104 /* just after backoff */
105 ca->minB = ca->maxB = ca->Bi = cur_Bi;
106 } else {
107 ca->Bi = (3*ca->Bi + cur_Bi)/4;
108 if (ca->Bi > ca->maxB)
109 ca->maxB = ca->Bi;
110 if (ca->minB > ca->maxB)
111 ca->minB = ca->maxB;
112 }
113 ca->packetcount = 0;
114 ca->lasttime = now;
115 }
116}
117
118static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
119{
120 if (use_bandwidth_switch) {
121 u32 maxB = ca->maxB;
122 u32 old_maxB = ca->old_maxB;
123 ca->old_maxB = ca->maxB;
124
125 if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) {
126 ca->beta = BETA_MIN;
127 ca->modeswitch = 0;
128 return;
129 }
130 }
131
132 if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) {
133 ca->beta = (minRTT<<7)/maxRTT;
134 if (ca->beta < BETA_MIN)
135 ca->beta = BETA_MIN;
136 else if (ca->beta > BETA_MAX)
137 ca->beta = BETA_MAX;
138 } else {
139 ca->beta = BETA_MIN;
140 ca->modeswitch = 1;
141 }
142}
143
144static inline void htcp_alpha_update(struct htcp *ca)
145{
146 u32 minRTT = ca->minRTT;
147 u32 factor = 1;
148 u32 diff = ca->ccount * minRTT; /* time since last backoff */
149
150 if (diff > HZ) {
151 diff -= HZ;
152 factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ;
153 }
154
155 if (use_rtt_scaling && minRTT) {
156 u32 scale = (HZ<<3)/(10*minRTT);
157 scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */
158 factor = (factor<<3)/scale;
159 if (!factor)
160 factor = 1;
161 }
162
163 ca->alpha = 2*factor*((1<<7)-ca->beta);
164 if (!ca->alpha)
165 ca->alpha = ALPHA_BASE;
166}
167
168/* After we have the rtt data to calculate beta, we'd still prefer to wait one
169 * rtt before we adjust our beta to ensure we are working from
170 * consistent data.
171 *
172 * This function should be called when we hit a congestion event since only at
173 * that point do we really have a sense of maxRTT (the queues along the
174 * path have just become too full).
175 */
176static void htcp_param_update(struct tcp_sock *tp)
177{
178 struct htcp *ca = tcp_ca(tp);
179 u32 minRTT = ca->minRTT;
180 u32 maxRTT = ca->maxRTT;
181
182 htcp_beta_update(ca, minRTT, maxRTT);
183 htcp_alpha_update(ca);
184
185 /* add slowly fading memory for maxRTT to accommodate routing changes etc */
186 if (minRTT > 0 && maxRTT > minRTT)
187 ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100;
188}
189
190static u32 htcp_recalc_ssthresh(struct tcp_sock *tp)
191{
192 struct htcp *ca = tcp_ca(tp);
193 htcp_param_update(tp);
194 return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
195}
196
197static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
198 u32 in_flight, int data_acked)
199{
200 struct htcp *ca = tcp_ca(tp);
201
202 if (in_flight < tp->snd_cwnd)
203 return;
204
205 if (tp->snd_cwnd <= tp->snd_ssthresh) {
206 /* In "safe" area, increase. */
207 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
208 tp->snd_cwnd++;
209 } else {
210 measure_rtt(tp);
211
212 /* keep track of number of round-trip times since last backoff event */
213 if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) {
214 ca->ccount++;
215 ca->snd_cwnd_cnt2 = 0;
216 htcp_alpha_update(ca);
217 }
218
219 /* In dangerous area, increase slowly.
220 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
221 */
222 if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) {
223 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
224 tp->snd_cwnd++;
225 tp->snd_cwnd_cnt = 0;
226 ca->ccount++;
227 }
228 }
229}
230
231/* Lower bound on congestion window. */
232static u32 htcp_min_cwnd(struct tcp_sock *tp)
233{
234 return tp->snd_ssthresh;
235}
236
237
238static void htcp_init(struct tcp_sock *tp)
239{
240 struct htcp *ca = tcp_ca(tp);
241
242 memset(ca, 0, sizeof(struct htcp));
243 ca->alpha = ALPHA_BASE;
244 ca->beta = BETA_MIN;
245}
246
247static void htcp_state(struct tcp_sock *tp, u8 new_state)
248{
249 switch (new_state) {
250 case TCP_CA_CWR:
251 case TCP_CA_Recovery:
252 case TCP_CA_Loss:
253 htcp_reset(tcp_ca(tp));
254 break;
255 }
256}
257
258static struct tcp_congestion_ops htcp = {
259 .init = htcp_init,
260 .ssthresh = htcp_recalc_ssthresh,
261 .min_cwnd = htcp_min_cwnd,
262 .cong_avoid = htcp_cong_avoid,
263 .set_state = htcp_state,
264 .undo_cwnd = htcp_cwnd_undo,
265 .pkts_acked = measure_achieved_throughput,
266 .owner = THIS_MODULE,
267 .name = "htcp",
268};
269
270static int __init htcp_register(void)
271{
272 BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE);
273 BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
274 if (!use_bandwidth_switch)
275 htcp.pkts_acked = NULL;
276 return tcp_register_congestion_control(&htcp);
277}
278
279static void __exit htcp_unregister(void)
280{
281 tcp_unregister_congestion_control(&htcp);
282}
283
284module_init(htcp_register);
285module_exit(htcp_unregister);
286
287MODULE_AUTHOR("Baruch Even");
288MODULE_LICENSE("GPL");
289MODULE_DESCRIPTION("H-TCP");
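
A standalone sketch (not kernel code) of the alpha schedule implemented in htcp_alpha_update() above, with RTT scaling disabled and beta fixed at BETA_MIN. After the first second since backoff, factor is approximately 1 + 10t + t^2/4, where t is the number of seconds beyond that first second; alpha is kept in <<7 fixed point:

#include <stdio.h>

#define HZ 1000
#define BETA_MIN (1 << 6)	/* 0.5 in <<7 fixed point */

static unsigned alpha(unsigned diff)	/* diff = ccount*minRTT, jiffies */
{
	unsigned factor = 1;

	/* Same arithmetic as htcp_alpha_update() above. */
	if (diff > HZ) {
		diff -= HZ;
		factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
	}
	return 2 * factor * ((1 << 7) - BETA_MIN);
}

int main(void)
{
	unsigned t;

	for (t = 0; t <= 4; t++)	/* seconds since last backoff */
		printf("t=%us alpha=%u (~%u segments per RTT)\n",
		       t, alpha(t * HZ), alpha(t * HZ) >> 7);
	return 0;
}
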
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
new file mode 100644
index 000000000000..13a66342c304
--- /dev/null
+++ b/net/ipv4/tcp_hybla.c
@@ -0,0 +1,187 @@
1/*
2 * TCP HYBLA
3 *
4 * TCP-HYBLA Congestion control algorithm, based on:
 5 * C. Caini, R. Firrincieli, "TCP-Hybla: A TCP Enhancement
 6 * for Heterogeneous Networks",
 7 * International Journal of Satellite Communications and Networking,
 8 * September 2004
9 * Daniele Lacamera
10 * root at danielinux.net
11 */
12
13#include <linux/config.h>
14#include <linux/module.h>
15#include <net/tcp.h>
16
17/* Tcp Hybla structure. */
18struct hybla {
19 u8 hybla_en;
20 u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */
21 u32 rho; /* Rho parameter, integer part */
22 u32 rho2; /* Rho * Rho, integer part */
23 u32 rho_3ls; /* Rho parameter, <<3 */
24 u32 rho2_7ls; /* Rho^2, <<7 */
25 u32 minrtt; /* Minimum smoothed round trip time value seen */
26};
27
28/* Hybla reference round trip time (default= 1/40 sec = 25 ms),
29 expressed in jiffies */
30static int rtt0 = 25;
31module_param(rtt0, int, 0644);
 32MODULE_PARM_DESC(rtt0, "reference round trip time (ms)");
33
34
35/* This is called to refresh values for hybla parameters */
 36static inline void hybla_recalc_param(struct tcp_sock *tp)
37{
38 struct hybla *ca = tcp_ca(tp);
39
40 ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8);
41 ca->rho = ca->rho_3ls >> 3;
42 ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
43 ca->rho2 = ca->rho2_7ls >>7;
44}
45
46static void hybla_init(struct tcp_sock *tp)
47{
48 struct hybla *ca = tcp_ca(tp);
49
50 ca->rho = 0;
51 ca->rho2 = 0;
52 ca->rho_3ls = 0;
53 ca->rho2_7ls = 0;
54 ca->snd_cwnd_cents = 0;
55 ca->hybla_en = 1;
56 tp->snd_cwnd = 2;
57 tp->snd_cwnd_clamp = 65535;
58
59 /* 1st Rho measurement based on initial srtt */
60 hybla_recalc_param(tp);
61
62 /* set minimum rtt as this is the 1st ever seen */
63 ca->minrtt = tp->srtt;
64 tp->snd_cwnd = ca->rho;
65}
66
67static void hybla_state(struct tcp_sock *tp, u8 ca_state)
68{
69 struct hybla *ca = tcp_ca(tp);
70
71 ca->hybla_en = (ca_state == TCP_CA_Open);
72}
73
74static inline u32 hybla_fraction(u32 odds)
75{
76 static const u32 fractions[] = {
77 128, 139, 152, 165, 181, 197, 215, 234,
78 };
79
80 return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128;
81}
82
83/* TCP Hybla main routine.
84 * This is the algorithm behavior:
85 * o Recalc Hybla parameters if min_rtt has changed
86 * o Give cwnd a new value based on the model proposed
87 * o remember increments <1
88 */
89static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
90 u32 in_flight, int flag)
91{
92 struct hybla *ca = tcp_ca(tp);
93 u32 increment, odd, rho_fractions;
94 int is_slowstart = 0;
95
96 /* Recalculate rho only if this srtt is the lowest */
97 if (tp->srtt < ca->minrtt){
98 hybla_recalc_param(tp);
99 ca->minrtt = tp->srtt;
100 }
101
102 if (!ca->hybla_en)
103 return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag);
104
105 if (in_flight < tp->snd_cwnd)
106 return;
107
108 if (ca->rho == 0)
109 hybla_recalc_param(tp);
110
111 rho_fractions = ca->rho_3ls - (ca->rho << 3);
112
113 if (tp->snd_cwnd < tp->snd_ssthresh) {
114 /*
115 * slow start
116 * INC = 2^RHO - 1
117 * This is done by splitting the rho parameter
118 * into 2 parts: an integer part and a fraction part.
119	 * Increment<<7 is estimated by doing:
120 * [2^(int+fract)]<<7
121 * that is equal to:
122 * (2^int) * [(2^fract) <<7]
123	 * 2^int is computed directly as 1<<int,
124	 * while hybla_fraction() is used to
125	 * calculate 2^fract as a <<7 value.
126 */
127 is_slowstart = 1;
128 increment = ((1 << ca->rho) * hybla_fraction(rho_fractions))
129 - 128;
130 } else {
131 /*
132 * congestion avoidance
133 * INC = RHO^2 / W
134 * as long as increment is estimated as (rho<<7)/window
135 * it already is <<7 and we can easily count its fractions.
136 */
137 increment = ca->rho2_7ls / tp->snd_cwnd;
138 if (increment < 128)
139 tp->snd_cwnd_cnt++;
140 }
141
142 odd = increment % 128;
143 tp->snd_cwnd += increment >> 7;
144 ca->snd_cwnd_cents += odd;
145
146	/* carry the fraction: each time it reaches >= 128, increase cwnd by 1. */
147	while (ca->snd_cwnd_cents >= 128) {
148 tp->snd_cwnd++;
149 ca->snd_cwnd_cents -= 128;
150 tp->snd_cwnd_cnt = 0;
151 }
152
153 /* clamp down slowstart cwnd to ssthresh value. */
154 if (is_slowstart)
155 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
156
157 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
158}
159
160static struct tcp_congestion_ops tcp_hybla = {
161 .init = hybla_init,
162 .ssthresh = tcp_reno_ssthresh,
163 .min_cwnd = tcp_reno_min_cwnd,
164 .cong_avoid = hybla_cong_avoid,
165 .set_state = hybla_state,
166
167 .owner = THIS_MODULE,
168 .name = "hybla"
169};
170
171static int __init hybla_register(void)
172{
173 BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE);
174 return tcp_register_congestion_control(&tcp_hybla);
175}
176
177static void __exit hybla_unregister(void)
178{
179 tcp_unregister_congestion_control(&tcp_hybla);
180}
181
182module_init(hybla_register);
183module_exit(hybla_unregister);
184
185MODULE_AUTHOR("Daniele Lacamera");
186MODULE_LICENSE("GPL");
187MODULE_DESCRIPTION("TCP Hybla");
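
A standalone sketch (not kernel code) of the Hybla fixed-point arithmetic above: rho = srtt/rtt0 kept <<3, rho^2 kept <<7, and the congestion-avoidance increment rho^2/cwnd with fractional carry as in hybla_cong_avoid(); the snd_cwnd_cnt bookkeeping and the clamps are omitted:

#include <stdio.h>

int main(void)
{
	unsigned srtt_ms = 200, rtt0_ms = 25, cwnd = 20, cents = 0;
	unsigned rho_3ls, rho2_7ls, increment, ack;

	rho_3ls = (srtt_ms << 3) / rtt0_ms;	/* rho << 3, here rho = 8 */
	rho2_7ls = (rho_3ls * rho_3ls) << 1;	/* rho^2 << 7 */

	/* One RTT worth of ACKs in congestion avoidance. */
	for (ack = 0; ack < 20; ack++) {
		increment = rho2_7ls / cwnd;	/* rho^2/cwnd, << 7 */
		cwnd += increment >> 7;		/* integer part */
		cents += increment % 128;	/* fractional part */
		while (cents >= 128) {		/* carry fractions */
			cwnd++;
			cents -= 128;
		}
	}
	printf("rho=%u: cwnd 20 -> %u after one RTT\n", rho_3ls >> 3, cwnd);
	return 0;
}
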
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..53a8a5399f1e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission 61 * Panu Kuhlberg: Experimental audit of TCP (re)transmission
62 * engine. Lots of bugs are found. 62 * engine. Lots of bugs are found.
63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 63 * Pasi Sarolahti: F-RTO for dealing with spurious RTOs
64 * Angelo Dell'Aera: TCP Westwood+ support
65 */ 64 */
66 65
67#include <linux/config.h> 66#include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
88int sysctl_tcp_max_orphans = NR_FILE; 87int sysctl_tcp_max_orphans = NR_FILE;
89int sysctl_tcp_frto; 88int sysctl_tcp_frto;
90int sysctl_tcp_nometrics_save; 89int sysctl_tcp_nometrics_save;
91int sysctl_tcp_westwood;
92int sysctl_tcp_vegas_cong_avoid;
93 90
94int sysctl_tcp_moderate_rcvbuf = 1; 91int sysctl_tcp_moderate_rcvbuf = 1;
95 92
96/* Default values of the Vegas variables, in fixed-point representation
97 * with V_PARAM_SHIFT bits to the right of the binary point.
98 */
99#define V_PARAM_SHIFT 1
100int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
101int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT;
102int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
103int sysctl_tcp_bic = 1;
104int sysctl_tcp_bic_fast_convergence = 1;
105int sysctl_tcp_bic_low_window = 14;
106int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
107
108#define FLAG_DATA 0x01 /* Incoming frame contained data. */ 93#define FLAG_DATA 0x01 /* Incoming frame contained data. */
109#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 94#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
110#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 95#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
333 tp->snd_cwnd_stamp = tcp_time_stamp; 318 tp->snd_cwnd_stamp = tcp_time_stamp;
334} 319}
335 320
336static void init_bictcp(struct tcp_sock *tp)
337{
338 tp->bictcp.cnt = 0;
339
340 tp->bictcp.last_max_cwnd = 0;
341 tp->bictcp.last_cwnd = 0;
342 tp->bictcp.last_stamp = 0;
343}
344
345/* 5. Recalculate window clamp after socket hit its memory bounds. */ 321/* 5. Recalculate window clamp after socket hit its memory bounds. */
346static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) 322static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
347{ 323{
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
558 tcp_grow_window(sk, tp, skb); 534 tcp_grow_window(sk, tp, skb);
559} 535}
560 536
561/* When starting a new connection, pin down the current choice of
562 * congestion algorithm.
563 */
564void tcp_ca_init(struct tcp_sock *tp)
565{
566 if (sysctl_tcp_westwood)
567 tp->adv_cong = TCP_WESTWOOD;
568 else if (sysctl_tcp_bic)
569 tp->adv_cong = TCP_BIC;
570 else if (sysctl_tcp_vegas_cong_avoid) {
571 tp->adv_cong = TCP_VEGAS;
572 tp->vegas.baseRTT = 0x7fffffff;
573 tcp_vegas_enable(tp);
574 }
575}
576
577/* Do RTT sampling needed for Vegas.
578 * Basically we:
579 * o min-filter RTT samples from within an RTT to get the current
580 * propagation delay + queuing delay (we are min-filtering to try to
581 * avoid the effects of delayed ACKs)
582 * o min-filter RTT samples from a much longer window (forever for now)
583 * to find the propagation delay (baseRTT)
584 */
585static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
586{
587 __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
588
589 /* Filter to find propagation delay: */
590 if (vrtt < tp->vegas.baseRTT)
591 tp->vegas.baseRTT = vrtt;
592
593 /* Find the min RTT during the last RTT to find
594 * the current prop. delay + queuing delay:
595 */
596 tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
597 tp->vegas.cntRTT++;
598}
599
600/* Called to compute a smoothed rtt estimate. The data fed to this 537/* Called to compute a smoothed rtt estimate. The data fed to this
601 * routine either comes from timestamps, or from segments that were 538 * routine either comes from timestamps, or from segments that were
602 * known _not_ to have been retransmitted [see Karn/Partridge 539 * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
606 * To save cycles in the RFC 1323 implementation it was better to break 543 * To save cycles in the RFC 1323 implementation it was better to break
607 * it up into three procedures. -- erics 544 * it up into three procedures. -- erics
608 */ 545 */
609static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) 546static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
610{ 547{
611 long m = mrtt; /* RTT */ 548 long m = mrtt; /* RTT */
612 549
613 if (tcp_vegas_enabled(tp))
614 vegas_rtt_calc(tp, mrtt);
615
616 /* The following amusing code comes from Jacobson's 550 /* The following amusing code comes from Jacobson's
617 * article in SIGCOMM '88. Note that rtt and mdev 551 * article in SIGCOMM '88. Note that rtt and mdev
618 * are scaled versions of rtt and mean deviation. 552 * are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
670 tp->rtt_seq = tp->snd_nxt; 604 tp->rtt_seq = tp->snd_nxt;
671 } 605 }
672 606
673 tcp_westwood_update_rtt(tp, tp->srtt >> 3); 607 if (tp->ca_ops->rtt_sample)
608 tp->ca_ops->rtt_sample(tp, *usrtt);
674} 609}
675 610
676/* Calculate rto without backoff. This is the second half of Van Jacobson's 611/* Calculate rto without backoff. This is the second half of Van Jacobson's
@@ -805,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
805 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 740 __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
806 741
807 if (!cwnd) { 742 if (!cwnd) {
808 if (tp->mss_cache_std > 1460) 743 if (tp->mss_cache > 1460)
809 cwnd = 2; 744 cwnd = 2;
810 else 745 else
811 cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; 746 cwnd = (tp->mss_cache > 1095) ? 3 : 4;
812 } 747 }
813 return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 748 return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
814} 749}
@@ -979,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
979 if (sk->sk_route_caps & NETIF_F_TSO) { 914 if (sk->sk_route_caps & NETIF_F_TSO) {
980 sk->sk_route_caps &= ~NETIF_F_TSO; 915 sk->sk_route_caps &= ~NETIF_F_TSO;
981 sock_set_flag(sk, SOCK_NO_LARGESEND); 916 sock_set_flag(sk, SOCK_NO_LARGESEND);
982 tp->mss_cache = tp->mss_cache_std; 917 tp->mss_cache = tp->mss_cache;
983 } 918 }
984 919
985 if (!tp->sacked_out) 920 if (!tp->sacked_out)
@@ -1142,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
1142 (IsFack(tp) || 1077 (IsFack(tp) ||
1143 !before(lost_retrans, 1078 !before(lost_retrans,
1144 TCP_SKB_CB(skb)->ack_seq + tp->reordering * 1079 TCP_SKB_CB(skb)->ack_seq + tp->reordering *
1145 tp->mss_cache_std))) { 1080 tp->mss_cache))) {
1146 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1081 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1147 tp->retrans_out -= tcp_skb_pcount(skb); 1082 tp->retrans_out -= tcp_skb_pcount(skb);
1148 1083
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
1185 tp->snd_una == tp->high_seq || 1120 tp->snd_una == tp->high_seq ||
1186 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1121 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1187 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1122 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1188 if (!tcp_westwood_ssthresh(tp)) 1123 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1189 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1124 tcp_ca_event(tp, CA_EVENT_FRTO);
1190 } 1125 }
1191 1126
1192 /* Have to clear retransmission markers here to keep the bookkeeping 1127 /* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
1252 tcp_set_ca_state(tp, TCP_CA_Loss); 1187 tcp_set_ca_state(tp, TCP_CA_Loss);
1253 tp->high_seq = tp->frto_highmark; 1188 tp->high_seq = tp->frto_highmark;
1254 TCP_ECN_queue_cwr(tp); 1189 TCP_ECN_queue_cwr(tp);
1255
1256 init_bictcp(tp);
1257} 1190}
1258 1191
1259void tcp_clear_retrans(struct tcp_sock *tp) 1192void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
1283 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || 1216 if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
1284 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { 1217 (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
1285 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1218 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1286 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1219 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1220 tcp_ca_event(tp, CA_EVENT_LOSS);
1287 } 1221 }
1288 tp->snd_cwnd = 1; 1222 tp->snd_cwnd = 1;
1289 tp->snd_cwnd_cnt = 0; 1223 tp->snd_cwnd_cnt = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
1596} 1530}
1597 1531
1598/* Decrease cwnd each second ack. */ 1532/* Decrease cwnd each second ack. */
1599
1600static void tcp_cwnd_down(struct tcp_sock *tp) 1533static void tcp_cwnd_down(struct tcp_sock *tp)
1601{ 1534{
1602 int decr = tp->snd_cwnd_cnt + 1; 1535 int decr = tp->snd_cwnd_cnt + 1;
1603 __u32 limit;
1604
1605 /*
1606 * TCP Westwood
1607 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
1608 * in packets we use mss_cache). If sysctl_tcp_westwood is off
1609 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
1610 * still used as usual. It prevents other strange cases in which
1611 * BWE*RTTmin could assume value 0. It should not happen but...
1612 */
1613
1614 if (!(limit = tcp_westwood_bw_rttmin(tp)))
1615 limit = tp->snd_ssthresh/2;
1616 1536
1617 tp->snd_cwnd_cnt = decr&1; 1537 tp->snd_cwnd_cnt = decr&1;
1618 decr >>= 1; 1538 decr >>= 1;
1619 1539
1620 if (decr && tp->snd_cwnd > limit) 1540 if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
1621 tp->snd_cwnd -= decr; 1541 tp->snd_cwnd -= decr;
1622 1542
1623 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); 1543 tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
1654static void tcp_undo_cwr(struct tcp_sock *tp, int undo) 1574static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
1655{ 1575{
1656 if (tp->prior_ssthresh) { 1576 if (tp->prior_ssthresh) {
1657 if (tcp_is_bic(tp)) 1577 if (tp->ca_ops->undo_cwnd)
1658 tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); 1578 tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
1659 else 1579 else
1660 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); 1580 tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
1661 1581
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
1767 1687
1768static inline void tcp_complete_cwr(struct tcp_sock *tp) 1688static inline void tcp_complete_cwr(struct tcp_sock *tp)
1769{ 1689{
1770 if (tcp_westwood_cwnd(tp)) 1690 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1771 tp->snd_ssthresh = tp->snd_cwnd;
1772 else
1773 tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
1774 tp->snd_cwnd_stamp = tcp_time_stamp; 1691 tp->snd_cwnd_stamp = tcp_time_stamp;
1692 tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
1775} 1693}
1776 1694
1777static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) 1695static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1946 if (tp->ca_state < TCP_CA_CWR) { 1864 if (tp->ca_state < TCP_CA_CWR) {
1947 if (!(flag&FLAG_ECE)) 1865 if (!(flag&FLAG_ECE))
1948 tp->prior_ssthresh = tcp_current_ssthresh(tp); 1866 tp->prior_ssthresh = tcp_current_ssthresh(tp);
1949 tp->snd_ssthresh = tcp_recalc_ssthresh(tp); 1867 tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
1950 TCP_ECN_queue_cwr(tp); 1868 TCP_ECN_queue_cwr(tp);
1951 } 1869 }
1952 1870
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
1963/* Read draft-ietf-tcplw-high-performance before mucking 1881/* Read draft-ietf-tcplw-high-performance before mucking
1964 * with this code. (Superceeds RFC1323) 1882 * with this code. (Superceeds RFC1323)
1965 */ 1883 */
1966static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) 1884static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
1967{ 1885{
1968 __u32 seq_rtt; 1886 __u32 seq_rtt;
1969 1887
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
1983 * in window is lost... Voila. --ANK (010210) 1901 * in window is lost... Voila. --ANK (010210)
1984 */ 1902 */
1985 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; 1903 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
1986 tcp_rtt_estimator(tp, seq_rtt); 1904 tcp_rtt_estimator(tp, seq_rtt, usrtt);
1987 tcp_set_rto(tp); 1905 tcp_set_rto(tp);
1988 tp->backoff = 0; 1906 tp->backoff = 0;
1989 tcp_bound_rto(tp); 1907 tcp_bound_rto(tp);
1990} 1908}
1991 1909
1992static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) 1910static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
1993{ 1911{
1994 /* We don't have a timestamp. Can only use 1912 /* We don't have a timestamp. Can only use
1995 * packets that are not retransmitted to determine 1913 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
2003 if (flag & FLAG_RETRANS_DATA_ACKED) 1921 if (flag & FLAG_RETRANS_DATA_ACKED)
2004 return; 1922 return;
2005 1923
2006 tcp_rtt_estimator(tp, seq_rtt); 1924 tcp_rtt_estimator(tp, seq_rtt, usrtt);
2007 tcp_set_rto(tp); 1925 tcp_set_rto(tp);
2008 tp->backoff = 0; 1926 tp->backoff = 0;
2009 tcp_bound_rto(tp); 1927 tcp_bound_rto(tp);
2010} 1928}
2011 1929
2012static inline void tcp_ack_update_rtt(struct tcp_sock *tp, 1930static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
2013 int flag, s32 seq_rtt) 1931 int flag, s32 seq_rtt, u32 *usrtt)
2014{ 1932{
2015 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ 1933 /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
2016 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 1934 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2017 tcp_ack_saw_tstamp(tp, flag); 1935 tcp_ack_saw_tstamp(tp, usrtt, flag);
2018 else if (seq_rtt >= 0) 1936 else if (seq_rtt >= 0)
2019 tcp_ack_no_tstamp(tp, seq_rtt, flag); 1937 tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
2020}
2021
2022/*
2023 * Compute congestion window to use.
2024 *
2025 * This is from the implementation of BICTCP in
2026 * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
2027 * "Binary Increase Congestion Control for Fast, Long Distance
2028 * Networks" in InfoComm 2004
2029 * Available from:
2030 * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
2031 *
2032 * Unless BIC is enabled and congestion window is large
2033 * this behaves the same as the original Reno.
2034 */
2035static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
2036{
2037 /* orignal Reno behaviour */
2038 if (!tcp_is_bic(tp))
2039 return tp->snd_cwnd;
2040
2041 if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
2042 (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
2043 return tp->bictcp.cnt;
2044
2045 tp->bictcp.last_cwnd = tp->snd_cwnd;
2046 tp->bictcp.last_stamp = tcp_time_stamp;
2047
2048 /* start off normal */
2049 if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
2050 tp->bictcp.cnt = tp->snd_cwnd;
2051
2052 /* binary increase */
2053 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
2054 __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
2055 / BICTCP_B;
2056
2057 if (dist > BICTCP_MAX_INCREMENT)
2058 /* linear increase */
2059 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2060 else if (dist <= 1U)
2061 /* binary search increase */
2062 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2063 / BICTCP_B;
2064 else
2065 /* binary search increase */
2066 tp->bictcp.cnt = tp->snd_cwnd / dist;
2067 } else {
2068 /* slow start and linear increase */
2069 if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
2070 /* slow start */
2071 tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
2072 / BICTCP_B;
2073 else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
2074 + BICTCP_MAX_INCREMENT*(BICTCP_B-1))
2075 /* slow start */
2076 tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
2077 / (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
2078 else
2079 /* linear increase */
2080 tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
2081 }
2082 return tp->bictcp.cnt;
2083} 1938}
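The removed helper above implements the paper's binary-search increase. As a rough standalone sketch of just that search (the constants below are illustrative stand-ins for BICTCP_B, BICTCP_MAX_INCREMENT and BICTCP_FUNC_OF_MIN_INCR, whose real values live elsewhere in the tree, and the slow-start phase past the old maximum is folded into the linear case):

    #include <stdio.h>

    /* Illustrative stand-ins for the kernel's BIC constants. */
    #define B              4
    #define MAX_INCREMENT 16
    #define MIN_INCR_NUM   5

    /* Per-ACK divisor: cwnd grows by about cwnd/cnt segments per RTT,
     * mirroring the regimes of the removed bictcp_cwnd(). */
    static unsigned int bic_cnt(unsigned int cwnd, unsigned int last_max)
    {
            if (cwnd < last_max) {
                    unsigned int dist = (last_max - cwnd) / B;

                    if (dist > MAX_INCREMENT)   /* far below old max: linear */
                            return cwnd / MAX_INCREMENT;
                    if (dist <= 1)              /* very close: creep slowly */
                            return cwnd * MIN_INCR_NUM / B;
                    return cwnd / dist;         /* binary search increase */
            }
            return cwnd / MAX_INCREMENT;        /* past old max: linear */
    }

    int main(void)
    {
            unsigned int cwnd;

            for (cwnd = 100; cwnd <= 200; cwnd += 25)
                    printf("cwnd=%3u -> cnt=%u\n", cwnd, bic_cnt(cwnd, 200));
            return 0;
    }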
2084 1939
2085/* This is Jacobson's slow start and congestion avoidance. 1940static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
2086 * SIGCOMM '88, p. 328. 1941 u32 in_flight, int good)
2087 */
2088static inline void reno_cong_avoid(struct tcp_sock *tp)
2089{ 1942{
2090 if (tp->snd_cwnd <= tp->snd_ssthresh) { 1943 tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
2091 /* In "safe" area, increase. */
2092 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2093 tp->snd_cwnd++;
2094 } else {
2095 /* In dangerous area, increase slowly.
2096 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
2097 */
2098 if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
2099 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
2100 tp->snd_cwnd++;
2101 tp->snd_cwnd_cnt=0;
2102 } else
2103 tp->snd_cwnd_cnt++;
2104 }
2105 tp->snd_cwnd_stamp = tcp_time_stamp; 1944 tp->snd_cwnd_stamp = tcp_time_stamp;
2106} 1945}
2107 1946
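The Reno behaviour now reached through tp->ca_ops->cong_avoid adds one segment per ACK below ssthresh and roughly one segment per window above it. A toy userspace model of that growth, assuming one ACK per in-flight segment per RTT and no loss:

    #include <stdio.h>

    int main(void)
    {
            unsigned int cwnd = 1, ssthresh = 16, cnt = 0, rtt, i;

            for (rtt = 1; rtt <= 12; rtt++) {
                    unsigned int acks = cwnd;       /* one ACK per segment */

                    for (i = 0; i < acks; i++) {
                            if (cwnd <= ssthresh) {
                                    cwnd++;                 /* slow start: +1 per ACK */
                            } else if (++cnt >= cwnd) {     /* CA: +1 per window */
                                    cwnd++;
                                    cnt = 0;
                            }
                    }
                    printf("after rtt %2u: cwnd=%u\n", rtt, cwnd);
            }
            return 0;
    }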
2108/* This is based on the congestion detection/avoidance scheme described in
2109 * Lawrence S. Brakmo and Larry L. Peterson.
2110 * "TCP Vegas: End to end congestion avoidance on a global internet."
2111 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
2112 * October 1995. Available from:
2113 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
2114 *
2115 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
2116 * The main aspects that distinguish this implementation from the
2117 * Arizona Vegas implementation are:
2118 * o We do not change the loss detection or recovery mechanisms of
2119 * Linux in any way. Linux already recovers from losses quite well,
2120 * using fine-grained timers, NewReno, and FACK.
2121 * o To avoid the performance penalty imposed by increasing cwnd
2122 * only every-other RTT during slow start, we increase during
2123 * every RTT during slow start, just like Reno.
2124 * o Largely to allow continuous cwnd growth during slow start,
2125 * we use the rate at which ACKs come back as the "actual"
2126 * rate, rather than the rate at which data is sent.
2127 * o To speed convergence to the right rate, we set the cwnd
2128 * to achieve the right ("actual") rate when we exit slow start.
2129 * o To filter out the noise caused by delayed ACKs, we use the
2130 * minimum RTT sample observed during the last RTT to calculate
2131 * the actual rate.
2132 * o When the sender re-starts from idle, it waits until it has
2133 * received ACKs for an entire flight of new data before making
2134 * a cwnd adjustment decision. The original Vegas implementation
2135 * assumed senders never went idle.
2136 */
2137static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2138{
2139 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
2140 *
2141 * These are so named because they represent the approximate values
2142 * of snd_una and snd_nxt at the beginning of the current RTT. More
2143 * precisely, they represent the amount of data sent during the RTT.
2144 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
2145 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
2146 * bytes of data have been ACKed during the course of the RTT, giving
2147 * an "actual" rate of:
2148 *
2149 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
2150 *
2151 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
2152 * because delayed ACKs can cover more than one segment, so they
2153 * don't line up nicely with the boundaries of RTTs.
2154 *
2155 * Another unfortunate fact of life is that delayed ACKs delay the
2156 * advance of the left edge of our send window, so that the number
2157 * of bytes we send in an RTT is often less than our cwnd will allow.
2158 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
2159 */
2160
2161 if (after(ack, tp->vegas.beg_snd_nxt)) {
2162 /* Do the Vegas once-per-RTT cwnd adjustment. */
2163 u32 old_wnd, old_snd_cwnd;
2164
2165
2166 /* Here old_wnd is essentially the window of data that was
2167 * sent during the previous RTT, and has all
2168 * been acknowledged in the course of the RTT that ended
2169 * with the ACK we just received. Likewise, old_snd_cwnd
2170 * is the cwnd during the previous RTT.
2171 */
2172 old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
2173 tp->mss_cache_std;
2174 old_snd_cwnd = tp->vegas.beg_snd_cwnd;
2175
2176 /* Save the extent of the current window so we can use this
2177 * at the end of the next RTT.
2178 */
2179 tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt;
2180 tp->vegas.beg_snd_nxt = tp->snd_nxt;
2181 tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
2182
2183 /* Take into account the current RTT sample too, to
2184 * decrease the impact of delayed acks. This double counts
2185 * this sample since we count it for the next window as well,
2186 * but that's not too awful, since we're taking the min,
2187 * rather than averaging.
2188 */
2189 vegas_rtt_calc(tp, seq_rtt);
2190
2191 /* We do the Vegas calculations only if we got enough RTT
2192 * samples that we can be reasonably sure that we got
2193 * at least one RTT sample that wasn't from a delayed ACK.
2194 * If we only had 2 samples total,
2195 * then that means we're getting only 1 ACK per RTT, which
2196 * means they're almost certainly delayed ACKs.
2197 * If we have 3 samples, we should be OK.
2198 */
2199
2200 if (tp->vegas.cntRTT <= 2) {
2201 /* We don't have enough RTT samples to do the Vegas
2202 * calculation, so we'll behave like Reno.
2203 */
2204 if (tp->snd_cwnd > tp->snd_ssthresh)
2205 tp->snd_cwnd++;
2206 } else {
2207 u32 rtt, target_cwnd, diff;
2208
2209 /* We have enough RTT samples, so, using the Vegas
2210 * algorithm, we determine if we should increase or
2211 * decrease cwnd, and by how much.
2212 */
2213
2214 /* Pluck out the RTT we are using for the Vegas
2215 * calculations. This is the min RTT seen during the
2216 * last RTT. Taking the min filters out the effects
2217 * of delayed ACKs, at the cost of noticing congestion
2218 * a bit later.
2219 */
2220 rtt = tp->vegas.minRTT;
2221
2222 /* Calculate the cwnd we should have, if we weren't
2223 * going too fast.
2224 *
2225 * This is:
2226 * (actual rate in segments) * baseRTT
2227 * We keep it as a fixed point number with
2228 * V_PARAM_SHIFT bits to the right of the binary point.
2229 */
2230 target_cwnd = ((old_wnd * tp->vegas.baseRTT)
2231 << V_PARAM_SHIFT) / rtt;
2232
2233 /* Calculate the difference between the window we had,
2234 * and the window we would like to have. This quantity
2235 * is the "Diff" from the Arizona Vegas papers.
2236 *
2237 * Again, this is a fixed point number with
2238 * V_PARAM_SHIFT bits to the right of the binary
2239 * point.
2240 */
2241 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
2242
2243 if (tp->snd_cwnd < tp->snd_ssthresh) {
2244 /* Slow start. */
2245 if (diff > sysctl_tcp_vegas_gamma) {
2246 /* Going too fast. Time to slow down
2247 * and switch to congestion avoidance.
2248 */
2249 tp->snd_ssthresh = 2;
2250
2251 /* Set cwnd to match the actual rate
2252 * exactly:
2253 * cwnd = (actual rate) * baseRTT
2254 * Then we add 1 because the integer
2255 * truncation robs us of full link
2256 * utilization.
2257 */
2258 tp->snd_cwnd = min(tp->snd_cwnd,
2259 (target_cwnd >>
2260 V_PARAM_SHIFT)+1);
2261
2262 }
2263 } else {
2264 /* Congestion avoidance. */
2265 u32 next_snd_cwnd;
2266
2267 /* Figure out where we would like cwnd
2268 * to be.
2269 */
2270 if (diff > sysctl_tcp_vegas_beta) {
2271 /* The old window was too fast, so
2272 * we slow down.
2273 */
2274 next_snd_cwnd = old_snd_cwnd - 1;
2275 } else if (diff < sysctl_tcp_vegas_alpha) {
2276 /* We don't have enough extra packets
2277 * in the network, so speed up.
2278 */
2279 next_snd_cwnd = old_snd_cwnd + 1;
2280 } else {
2281 /* Sending just as fast as we
2282 * should be.
2283 */
2284 next_snd_cwnd = old_snd_cwnd;
2285 }
2286
2287 /* Adjust cwnd upward or downward, toward the
2288 * desired value.
2289 */
2290 if (next_snd_cwnd > tp->snd_cwnd)
2291 tp->snd_cwnd++;
2292 else if (next_snd_cwnd < tp->snd_cwnd)
2293 tp->snd_cwnd--;
2294 }
2295 }
2296
2297 /* Wipe the slate clean for the next RTT. */
2298 tp->vegas.cntRTT = 0;
2299 tp->vegas.minRTT = 0x7fffffff;
2300 }
2301
2302 /* The following code is executed for every ack we receive,
2303 * except for conditions checked in should_advance_cwnd()
2304 * before the call to tcp_cong_avoid(). Mainly this means that
2305 * we only execute this code if the ack actually acked some
2306 * data.
2307 */
2308
2309 /* If we are in slow start, increase our cwnd in response to this ACK.
2310 * (If we are not in slow start then we are in congestion avoidance,
2311 * and adjust our congestion window only once per RTT. See the code
2312 * above.)
2313 */
2314 if (tp->snd_cwnd <= tp->snd_ssthresh)
2315 tp->snd_cwnd++;
2316
2317 /* to keep cwnd from growing without bound */
2318 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
2319
2320 /* Make sure that we are never so timid as to reduce our cwnd below
2321 * 2 MSS.
2322 *
2323 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
2324 */
2325 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
2326
2327 tp->snd_cwnd_stamp = tcp_time_stamp;
2328}
2329
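The whole once-per-RTT adjustment above boils down to comparing diff = cwnd - actual_rate * baseRTT, kept in V_PARAM_SHIFT fixed point, against the alpha/beta thresholds. A minimal sketch of the congestion-avoidance branch, with made-up threshold values standing in for the sysctls:

    #include <stdio.h>

    #define V_PARAM_SHIFT 1                     /* fixed point, as above */
    #define ALPHA (2 << V_PARAM_SHIFT)          /* hypothetical sysctl values */
    #define BETA  (4 << V_PARAM_SHIFT)

    /* Next cwnd given last RTT's window, the propagation delay
     * estimate (baseRTT) and the min RTT seen during that window. */
    static unsigned int vegas_next_cwnd(unsigned int old_wnd,
                                        unsigned int base_rtt,
                                        unsigned int min_rtt)
    {
            unsigned int target = ((old_wnd * base_rtt) << V_PARAM_SHIFT) / min_rtt;
            unsigned int diff = (old_wnd << V_PARAM_SHIFT) - target;

            if (diff > BETA)        /* too much queued in the path: back off */
                    return old_wnd - 1;
            if (diff < ALPHA)       /* too little in flight: probe for more */
                    return old_wnd + 1;
            return old_wnd;         /* just right */
    }

    int main(void)
    {
            /* baseRTT 100ms, last min RTT 150ms, window of 20 segments */
            printf("next cwnd = %u\n", vegas_next_cwnd(20, 100, 150));
            return 0;
    }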
2330static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
2331{
2332 if (tcp_vegas_enabled(tp))
2333 vegas_cong_avoid(tp, ack, seq_rtt);
2334 else
2335 reno_cong_avoid(tp);
2336}
2337
2338/* Restart timer after forward progress on connection. 1947/* Restart timer after forward progress on connection.
2339 * RFC2988 recommends to restart timer to now+rto. 1948 * RFC2988 recommends to restart timer to now+rto.
2340 */ 1949 */
@@ -2348,15 +1957,6 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
2348 } 1957 }
2349} 1958}
2350 1959
2351/* There is one downside to this scheme. Although we keep the
2352 * ACK clock ticking, adjusting packet counters and advancing
2353 * congestion window, we do not liberate socket send buffer
2354 * space.
2355 *
2356 * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
2357 * then making a write space wakeup callback is a possible
2358 * future enhancement. WARNING: it is not trivial to make.
2359 */
2360static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, 1960static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2361 __u32 now, __s32 *seq_rtt) 1961 __u32 now, __s32 *seq_rtt)
2362{ 1962{
@@ -2415,13 +2015,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
2415 2015
2416 2016
2417/* Remove acknowledged frames from the retransmission queue. */ 2017/* Remove acknowledged frames from the retransmission queue. */
2418static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) 2018static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
2419{ 2019{
2420 struct tcp_sock *tp = tcp_sk(sk); 2020 struct tcp_sock *tp = tcp_sk(sk);
2421 struct sk_buff *skb; 2021 struct sk_buff *skb;
2422 __u32 now = tcp_time_stamp; 2022 __u32 now = tcp_time_stamp;
2423 int acked = 0; 2023 int acked = 0;
2424 __s32 seq_rtt = -1; 2024 __s32 seq_rtt = -1;
2025 struct timeval usnow;
2026 u32 pkts_acked = 0;
2027
2028 if (seq_usrtt)
2029 do_gettimeofday(&usnow);
2425 2030
2426 while ((skb = skb_peek(&sk->sk_write_queue)) && 2031 while ((skb = skb_peek(&sk->sk_write_queue)) &&
2427 skb != sk->sk_send_head) { 2032 skb != sk->sk_send_head) {
@@ -2433,7 +2038,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2433 * the other end. 2038 * the other end.
2434 */ 2039 */
2435 if (after(scb->end_seq, tp->snd_una)) { 2040 if (after(scb->end_seq, tp->snd_una)) {
2436 if (tcp_skb_pcount(skb) > 1) 2041 if (tcp_skb_pcount(skb) > 1 &&
2042 after(tp->snd_una, scb->seq))
2437 acked |= tcp_tso_acked(sk, skb, 2043 acked |= tcp_tso_acked(sk, skb,
2438 now, &seq_rtt); 2044 now, &seq_rtt);
2439 break; 2045 break;
@@ -2448,6 +2054,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2448 */ 2054 */
2449 if (!(scb->flags & TCPCB_FLAG_SYN)) { 2055 if (!(scb->flags & TCPCB_FLAG_SYN)) {
2450 acked |= FLAG_DATA_ACKED; 2056 acked |= FLAG_DATA_ACKED;
2057 ++pkts_acked;
2451 } else { 2058 } else {
2452 acked |= FLAG_SYN_ACKED; 2059 acked |= FLAG_SYN_ACKED;
2453 tp->retrans_stamp = 0; 2060 tp->retrans_stamp = 0;
@@ -2461,6 +2068,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2461 seq_rtt = -1; 2068 seq_rtt = -1;
2462 } else if (seq_rtt < 0) 2069 } else if (seq_rtt < 0)
2463 seq_rtt = now - scb->when; 2070 seq_rtt = now - scb->when;
2071 if (seq_usrtt)
2072 *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
2073 + (usnow.tv_usec - skb->stamp.tv_usec);
2074
2464 if (sacked & TCPCB_SACKED_ACKED) 2075 if (sacked & TCPCB_SACKED_ACKED)
2465 tp->sacked_out -= tcp_skb_pcount(skb); 2076 tp->sacked_out -= tcp_skb_pcount(skb);
2466 if (sacked & TCPCB_LOST) 2077 if (sacked & TCPCB_LOST)
@@ -2479,8 +2090,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
2479 } 2090 }
2480 2091
2481 if (acked&FLAG_ACKED) { 2092 if (acked&FLAG_ACKED) {
2482 tcp_ack_update_rtt(tp, acked, seq_rtt); 2093 tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
2483 tcp_ack_packets_out(sk, tp); 2094 tcp_ack_packets_out(sk, tp);
2095
2096 if (tp->ca_ops->pkts_acked)
2097 tp->ca_ops->pkts_acked(tp, pkts_acked);
2484 } 2098 }
2485 2099
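The tp->ca_ops indirection introduced here replaces per-algorithm branches with a callback table; optional hooks such as pkts_acked are tested for NULL before dispatch, as just above. A rough userspace sketch of that shape (the field names only follow the calls visible in this diff; the authoritative struct is defined in the tcp_cong.c/tcp.h changes, not reproduced here):

    #include <stdio.h>

    struct fake_tcp_sock;   /* stand-in for struct tcp_sock */

    /* Minimal sketch of a pluggable congestion-ops table. */
    struct cong_ops {
            void (*cong_avoid)(struct fake_tcp_sock *tp, unsigned int ack,
                               unsigned int rtt, unsigned int in_flight,
                               int good);
            void (*pkts_acked)(struct fake_tcp_sock *tp,
                               unsigned int num_acked);
    };

    static void reno_avoid(struct fake_tcp_sock *tp, unsigned int ack,
                           unsigned int rtt, unsigned int in_flight, int good)
    {
            printf("reno: ack=%u in_flight=%u\n", ack, in_flight);
    }

    static const struct cong_ops reno_ops = {
            .cong_avoid = reno_avoid,
            /* .pkts_acked left NULL: callers must test before dispatch,
             * as tcp_clean_rtx_queue() does above. */
    };

    int main(void)
    {
            const struct cong_ops *ca_ops = &reno_ops;

            ca_ops->cong_avoid(NULL, 1000, 0, 5, 1);
            if (ca_ops->pkts_acked)
                    ca_ops->pkts_acked(NULL, 3);
            return 0;
    }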
2486#if FASTRETRANS_DEBUG > 0 2100#if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2238,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
2624 tp->frto_counter = (tp->frto_counter + 1) % 3; 2238 tp->frto_counter = (tp->frto_counter + 1) % 3;
2625} 2239}
2626 2240
2627/*
2628 * TCP Westwood+
2629 */
2630
2631/*
2632 * @init_westwood
2633 * This function initializes fields used in TCP Westwood+. We can't
2634 * get any information about RTTmin at this time so we simply set it to
2635 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
2636 * conservative, since that way we're sure it will be updated in a consistent
2637 * way as soon as possible. It will reasonably happen within the first
2638 * RTT period of the connection lifetime.
2639 */
2640
2641static void init_westwood(struct sock *sk)
2642{
2643 struct tcp_sock *tp = tcp_sk(sk);
2644
2645 tp->westwood.bw_ns_est = 0;
2646 tp->westwood.bw_est = 0;
2647 tp->westwood.accounted = 0;
2648 tp->westwood.cumul_ack = 0;
2649 tp->westwood.rtt_win_sx = tcp_time_stamp;
2650 tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
2651 tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
2652 tp->westwood.snd_una = tp->snd_una;
2653}
2654
2655/*
2656 * @westwood_do_filter
2657 * Low-pass filter. Implemented using constant coefficients.
2658 */
2659
2660static inline __u32 westwood_do_filter(__u32 a, __u32 b)
2661{
2662 return (((7 * a) + b) >> 3);
2663}
2664
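westwood_do_filter() below is a fixed-gain low-pass filter, new = (7*old + sample)/8, i.e. an exponentially weighted moving average with gain 1/8. A quick standalone check of how such a filter tracks a step change in the bandwidth samples:

    #include <stdio.h>

    static unsigned int filt(unsigned int a, unsigned int b)
    {
            return (7 * a + b) >> 3;    /* same form as westwood_do_filter() */
    }

    int main(void)
    {
            unsigned int est = 100, i;

            /* Samples jump to 200; the estimate closes roughly 1/8 of
             * the remaining gap on each update. */
            for (i = 1; i <= 24; i++) {
                    est = filt(est, 200);
                    if (i % 8 == 0)
                            printf("after %2u samples: %u\n", i, est);
            }
            return 0;
    }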
2665static void westwood_filter(struct sock *sk, __u32 delta)
2666{
2667 struct tcp_sock *tp = tcp_sk(sk);
2668
2669 tp->westwood.bw_ns_est =
2670 westwood_do_filter(tp->westwood.bw_ns_est,
2671 tp->westwood.bk / delta);
2672 tp->westwood.bw_est =
2673 westwood_do_filter(tp->westwood.bw_est,
2674 tp->westwood.bw_ns_est);
2675}
2676
2677/*
2678 * @westwood_update_rttmin
2679 * It is used to update RTTmin. In this case we MUST NOT use
2680 * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
2681 */
2682
2683static inline __u32 westwood_update_rttmin(const struct sock *sk)
2684{
2685 const struct tcp_sock *tp = tcp_sk(sk);
2686 __u32 rttmin = tp->westwood.rtt_min;
2687
2688 if (tp->westwood.rtt != 0 &&
2689 (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
2690 rttmin = tp->westwood.rtt;
2691
2692 return rttmin;
2693}
2694
2695/*
2696 * @westwood_acked
2697 * Evaluate increases for dk.
2698 */
2699
2700static inline __u32 westwood_acked(const struct sock *sk)
2701{
2702 const struct tcp_sock *tp = tcp_sk(sk);
2703
2704 return tp->snd_una - tp->westwood.snd_una;
2705}
2706
2707/*
2708 * @westwood_new_window
2709 * It evaluates if we are receiving data inside the same RTT window as
2710 * when we started.
2711 * Return value:
2712 * It returns 0 if we are still evaluating samples in the same RTT
2713 * window, 1 if the sample has to be considered in the next window.
2714 */
2715
2716static int westwood_new_window(const struct sock *sk)
2717{
2718 const struct tcp_sock *tp = tcp_sk(sk);
2719 __u32 left_bound;
2720 __u32 rtt;
2721 int ret = 0;
2722
2723 left_bound = tp->westwood.rtt_win_sx;
2724 rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
2725
2726 /*
2727 * An RTT-window has passed. Be careful since if RTT is less than
2728 * 50ms we don't filter but we continue 'building the sample'.
2729 * This minimum limit was chosen because estimating over such small
2730 * time intervals is best avoided.
2731 * Obviously on a LAN we reasonably will always have
2732 * right_bound = left_bound + WESTWOOD_RTT_MIN
2733 */
2734
2735 if ((left_bound + rtt) < tcp_time_stamp)
2736 ret = 1;
2737
2738 return ret;
2739}
2740
2741/*
2742 * @westwood_update_window
2743 * It updates RTT evaluation window if it is the right moment to do
2744 * it. If so it calls filter for evaluating bandwidth.
2745 */
2746
2747static void __westwood_update_window(struct sock *sk, __u32 now)
2748{
2749 struct tcp_sock *tp = tcp_sk(sk);
2750 __u32 delta = now - tp->westwood.rtt_win_sx;
2751
2752 if (delta) {
2753 if (tp->westwood.rtt)
2754 westwood_filter(sk, delta);
2755
2756 tp->westwood.bk = 0;
2757 tp->westwood.rtt_win_sx = tcp_time_stamp;
2758 }
2759}
2760
2761
2762static void westwood_update_window(struct sock *sk, __u32 now)
2763{
2764 if (westwood_new_window(sk))
2765 __westwood_update_window(sk, now);
2766}
2767
2768/*
2769 * @__tcp_westwood_fast_bw
2770 * It is called when we are in fast path. In particular it is called when
2771 * header prediction is successful. In that case the update is in fact
2772 * straightforward and doesn't need any particular care.
2773 */
2774
2775static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2776{
2777 struct tcp_sock *tp = tcp_sk(sk);
2778
2779 westwood_update_window(sk, tcp_time_stamp);
2780
2781 tp->westwood.bk += westwood_acked(sk);
2782 tp->westwood.snd_una = tp->snd_una;
2783 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2784}
2785
2786static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
2787{
2788 if (tcp_is_westwood(tcp_sk(sk)))
2789 __tcp_westwood_fast_bw(sk, skb);
2790}
2791
2792
2793/*
2794 * @westwood_dupack_update
2795 * It updates accounted and cumul_ack when receiving a dupack.
2796 */
2797
2798static void westwood_dupack_update(struct sock *sk)
2799{
2800 struct tcp_sock *tp = tcp_sk(sk);
2801
2802 tp->westwood.accounted += tp->mss_cache_std;
2803 tp->westwood.cumul_ack = tp->mss_cache_std;
2804}
2805
2806static inline int westwood_may_change_cumul(struct tcp_sock *tp)
2807{
2808 return (tp->westwood.cumul_ack > tp->mss_cache_std);
2809}
2810
2811static inline void westwood_partial_update(struct tcp_sock *tp)
2812{
2813 tp->westwood.accounted -= tp->westwood.cumul_ack;
2814 tp->westwood.cumul_ack = tp->mss_cache_std;
2815}
2816
2817static inline void westwood_complete_update(struct tcp_sock *tp)
2818{
2819 tp->westwood.cumul_ack -= tp->westwood.accounted;
2820 tp->westwood.accounted = 0;
2821}
2822
2823/*
2824 * @westwood_acked_count
2825 * This function evaluates cumul_ack for evaluating dk in case of
2826 * delayed or partial acks.
2827 */
2828
2829static inline __u32 westwood_acked_count(struct sock *sk)
2830{
2831 struct tcp_sock *tp = tcp_sk(sk);
2832
2833 tp->westwood.cumul_ack = westwood_acked(sk);
2834
2835 /* If cumul_ack is 0 this is a dupack since it's not moving
2836 * tp->snd_una.
2837 */
2838 if (!(tp->westwood.cumul_ack))
2839 westwood_dupack_update(sk);
2840
2841 if (westwood_may_change_cumul(tp)) {
2842 /* Partial or delayed ack */
2843 if (tp->westwood.accounted >= tp->westwood.cumul_ack)
2844 westwood_partial_update(tp);
2845 else
2846 westwood_complete_update(tp);
2847 }
2848
2849 tp->westwood.snd_una = tp->snd_una;
2850
2851 return tp->westwood.cumul_ack;
2852}
2853
2854
2855/*
2856 * @__tcp_westwood_slow_bw
2857 * It is called when something is going wrong... even if there could
2858 * be no problems! In fact a simple delayed packet may trigger a
2859 * dupack. But we need to be careful in such a case.
2860 */
2861
2862static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2863{
2864 struct tcp_sock *tp = tcp_sk(sk);
2865
2866 westwood_update_window(sk, tcp_time_stamp);
2867
2868 tp->westwood.bk += westwood_acked_count(sk);
2869 tp->westwood.rtt_min = westwood_update_rttmin(sk);
2870}
2871
2872static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
2873{
2874 if (tcp_is_westwood(tcp_sk(sk)))
2875 __tcp_westwood_slow_bw(sk, skb);
2876}
2877
2878/* This routine deals with incoming acks, but not outgoing ones. */ 2241/* This routine deals with incoming acks, but not outgoing ones. */
2879static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) 2242static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2880{ 2243{
@@ -2884,6 +2247,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2884 u32 ack = TCP_SKB_CB(skb)->ack_seq; 2247 u32 ack = TCP_SKB_CB(skb)->ack_seq;
2885 u32 prior_in_flight; 2248 u32 prior_in_flight;
2886 s32 seq_rtt; 2249 s32 seq_rtt;
2250 s32 seq_usrtt = 0;
2887 int prior_packets; 2251 int prior_packets;
2888 2252
2889 /* If the ack is newer than sent or older than previous acks 2253 /* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2266,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2902 */ 2266 */
2903 tcp_update_wl(tp, ack, ack_seq); 2267 tcp_update_wl(tp, ack, ack_seq);
2904 tp->snd_una = ack; 2268 tp->snd_una = ack;
2905 tcp_westwood_fast_bw(sk, skb);
2906 flag |= FLAG_WIN_UPDATE; 2269 flag |= FLAG_WIN_UPDATE;
2907 2270
2271 tcp_ca_event(tp, CA_EVENT_FAST_ACK);
2272
2908 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); 2273 NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
2909 } else { 2274 } else {
2910 if (ack_seq != TCP_SKB_CB(skb)->end_seq) 2275 if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2285,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2920 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) 2285 if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
2921 flag |= FLAG_ECE; 2286 flag |= FLAG_ECE;
2922 2287
2923 tcp_westwood_slow_bw(sk,skb); 2288 tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
2924 } 2289 }
2925 2290
2926 /* We passed data and got it acked, remove any soft error 2291 /* We passed data and got it acked, remove any soft error
@@ -2935,22 +2300,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
2935 prior_in_flight = tcp_packets_in_flight(tp); 2300 prior_in_flight = tcp_packets_in_flight(tp);
2936 2301
2937 /* See if we can take anything off of the retransmit queue. */ 2302 /* See if we can take anything off of the retransmit queue. */
2938 flag |= tcp_clean_rtx_queue(sk, &seq_rtt); 2303 flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
2304 tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
2939 2305
2940 if (tp->frto_counter) 2306 if (tp->frto_counter)
2941 tcp_process_frto(sk, prior_snd_una); 2307 tcp_process_frto(sk, prior_snd_una);
2942 2308
2943 if (tcp_ack_is_dubious(tp, flag)) { 2309 if (tcp_ack_is_dubious(tp, flag)) {
2944 /* Advance CWND, if state allows this. */ 2310 /* Advance CWND, if state allows this. */
2945 if ((flag & FLAG_DATA_ACKED) && 2311 if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
2946 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && 2312 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0);
2947 tcp_may_raise_cwnd(tp, flag))
2948 tcp_cong_avoid(tp, ack, seq_rtt);
2949 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); 2313 tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
2950 } else { 2314 } else {
2951 if ((flag & FLAG_DATA_ACKED) && 2315 if ((flag & FLAG_DATA_ACKED))
2952 (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) 2316 tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
2953 tcp_cong_avoid(tp, ack, seq_rtt);
2954 } 2317 }
2955 2318
2956 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) 2319 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -3439,7 +2802,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
3439 int this_sack; 2802 int this_sack;
3440 2803
3441 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 2804 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
3442 if (skb_queue_len(&tp->out_of_order_queue) == 0) { 2805 if (skb_queue_empty(&tp->out_of_order_queue)) {
3443 tp->rx_opt.num_sacks = 0; 2806 tp->rx_opt.num_sacks = 0;
3444 tp->rx_opt.eff_sacks = tp->rx_opt.dsack; 2807 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
3445 return; 2808 return;
@@ -3572,13 +2935,13 @@ queue_and_out:
3572 if(th->fin) 2935 if(th->fin)
3573 tcp_fin(skb, sk, th); 2936 tcp_fin(skb, sk, th);
3574 2937
3575 if (skb_queue_len(&tp->out_of_order_queue)) { 2938 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3576 tcp_ofo_queue(sk); 2939 tcp_ofo_queue(sk);
3577 2940
3578 /* RFC2581. 4.2. SHOULD send immediate ACK, when 2941 /* RFC2581. 4.2. SHOULD send immediate ACK, when
3579 * gap in queue is filled. 2942 * gap in queue is filled.
3580 */ 2943 */
3581 if (!skb_queue_len(&tp->out_of_order_queue)) 2944 if (skb_queue_empty(&tp->out_of_order_queue))
3582 tp->ack.pingpong = 0; 2945 tp->ack.pingpong = 0;
3583 } 2946 }
3584 2947
@@ -3886,9 +3249,8 @@ static int tcp_prune_queue(struct sock *sk)
3886 * This must not ever occur. */ 3249 * This must not ever occur. */
3887 3250
3888 /* First, purge the out_of_order queue. */ 3251 /* First, purge the out_of_order queue. */
3889 if (skb_queue_len(&tp->out_of_order_queue)) { 3252 if (!skb_queue_empty(&tp->out_of_order_queue)) {
3890 NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 3253 NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
3891 skb_queue_len(&tp->out_of_order_queue));
3892 __skb_queue_purge(&tp->out_of_order_queue); 3254 __skb_queue_purge(&tp->out_of_order_queue);
3893 3255
3894 /* Reset SACK state. A conforming SACK implementation will 3256 /* Reset SACK state. A conforming SACK implementation will
@@ -3937,6 +3299,28 @@ void tcp_cwnd_application_limited(struct sock *sk)
3937 tp->snd_cwnd_stamp = tcp_time_stamp; 3299 tp->snd_cwnd_stamp = tcp_time_stamp;
3938} 3300}
3939 3301
3302static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp)
3303{
3304 /* If the user specified a specific send buffer setting, do
3305 * not modify it.
3306 */
3307 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
3308 return 0;
3309
3310 /* If we are under global TCP memory pressure, do not expand. */
3311 if (tcp_memory_pressure)
3312 return 0;
3313
3314 /* If we are under soft global TCP memory pressure, do not expand. */
3315 if (atomic_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
3316 return 0;
3317
3318 /* If we filled the congestion window, do not expand. */
3319 if (tp->packets_out >= tp->snd_cwnd)
3320 return 0;
3321
3322 return 1;
3323}
3940 3324
3941/* When incoming ACK allowed to free some skb from write_queue, 3325/* When incoming ACK allowed to free some skb from write_queue,
3942 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 3326 * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
@@ -3948,11 +3332,8 @@ static void tcp_new_space(struct sock *sk)
3948{ 3332{
3949 struct tcp_sock *tp = tcp_sk(sk); 3333 struct tcp_sock *tp = tcp_sk(sk);
3950 3334
3951 if (tp->packets_out < tp->snd_cwnd && 3335 if (tcp_should_expand_sndbuf(sk, tp)) {
3952 !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && 3336 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
3953 !tcp_memory_pressure &&
3954 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
3955 int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
3956 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), 3337 MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
3957 demanded = max_t(unsigned int, tp->snd_cwnd, 3338 demanded = max_t(unsigned int, tp->snd_cwnd,
3958 tp->reordering + 1); 3339 tp->reordering + 1);
@@ -3975,22 +3356,9 @@ static inline void tcp_check_space(struct sock *sk)
3975 } 3356 }
3976} 3357}
3977 3358
3978static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) 3359static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp)
3979{ 3360{
3980 struct tcp_sock *tp = tcp_sk(sk); 3361 tcp_push_pending_frames(sk, tp);
3981
3982 if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
3983 tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
3984 tcp_write_xmit(sk, tp->nonagle))
3985 tcp_check_probe_timer(sk, tp);
3986}
3987
3988static __inline__ void tcp_data_snd_check(struct sock *sk)
3989{
3990 struct sk_buff *skb = sk->sk_send_head;
3991
3992 if (skb != NULL)
3993 __tcp_data_snd_check(sk, skb);
3994 tcp_check_space(sk); 3362 tcp_check_space(sk);
3995} 3363}
3996 3364
@@ -4284,7 +3652,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4284 */ 3652 */
4285 tcp_ack(sk, skb, 0); 3653 tcp_ack(sk, skb, 0);
4286 __kfree_skb(skb); 3654 __kfree_skb(skb);
4287 tcp_data_snd_check(sk); 3655 tcp_data_snd_check(sk, tp);
4288 return 0; 3656 return 0;
4289 } else { /* Header too small */ 3657 } else { /* Header too small */
4290 TCP_INC_STATS_BH(TCP_MIB_INERRS); 3658 TCP_INC_STATS_BH(TCP_MIB_INERRS);
@@ -4350,7 +3718,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
4350 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 3718 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
4351 /* Well, only one small jumplet in fast path... */ 3719 /* Well, only one small jumplet in fast path... */
4352 tcp_ack(sk, skb, FLAG_DATA); 3720 tcp_ack(sk, skb, FLAG_DATA);
4353 tcp_data_snd_check(sk); 3721 tcp_data_snd_check(sk, tp);
4354 if (!tcp_ack_scheduled(tp)) 3722 if (!tcp_ack_scheduled(tp))
4355 goto no_ack; 3723 goto no_ack;
4356 } 3724 }
@@ -4428,7 +3796,7 @@ step5:
4428 /* step 7: process the segment text */ 3796 /* step 7: process the segment text */
4429 tcp_data_queue(sk, skb); 3797 tcp_data_queue(sk, skb);
4430 3798
4431 tcp_data_snd_check(sk); 3799 tcp_data_snd_check(sk, tp);
4432 tcp_ack_snd_check(sk); 3800 tcp_ack_snd_check(sk);
4433 return 0; 3801 return 0;
4434 3802
@@ -4552,6 +3920,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
4552 3920
4553 tcp_init_metrics(sk); 3921 tcp_init_metrics(sk);
4554 3922
3923 tcp_init_congestion_control(tp);
3924
4555 /* Prevent spurious tcp_cwnd_restart() on first data 3925 /* Prevent spurious tcp_cwnd_restart() on first data
4556 * packet. 3926 * packet.
4557 */ 3927 */
@@ -4708,9 +4078,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4708 if(tp->af_specific->conn_request(sk, skb) < 0) 4078 if(tp->af_specific->conn_request(sk, skb) < 0)
4709 return 1; 4079 return 1;
4710 4080
4711 init_westwood(sk);
4712 init_bictcp(tp);
4713
4714 /* Now we have several options: In theory there is 4081 /* Now we have several options: In theory there is
4715 * nothing else in the frame. KA9Q has an option to 4082 * nothing else in the frame. KA9Q has an option to
4716 * send data with the syn, BSD accepts data with the 4083 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4099,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4732 goto discard; 4099 goto discard;
4733 4100
4734 case TCP_SYN_SENT: 4101 case TCP_SYN_SENT:
4735 init_westwood(sk);
4736 init_bictcp(tp);
4737
4738 queued = tcp_rcv_synsent_state_process(sk, skb, th, len); 4102 queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4739 if (queued >= 0) 4103 if (queued >= 0)
4740 return queued; 4104 return queued;
@@ -4742,7 +4106,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4742 /* Do step6 onward by hand. */ 4106 /* Do step6 onward by hand. */
4743 tcp_urg(sk, skb, th); 4107 tcp_urg(sk, skb, th);
4744 __kfree_skb(skb); 4108 __kfree_skb(skb);
4745 tcp_data_snd_check(sk); 4109 tcp_data_snd_check(sk, tp);
4746 return 0; 4110 return 0;
4747 } 4111 }
4748 4112
@@ -4816,7 +4180,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4816 */ 4180 */
4817 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 4181 if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
4818 !tp->srtt) 4182 !tp->srtt)
4819 tcp_ack_saw_tstamp(tp, 0); 4183 tcp_ack_saw_tstamp(tp, 0, 0);
4820 4184
4821 if (tp->rx_opt.tstamp_ok) 4185 if (tp->rx_opt.tstamp_ok)
4822 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 4186 tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4192,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4828 4192
4829 tcp_init_metrics(sk); 4193 tcp_init_metrics(sk);
4830 4194
4195 tcp_init_congestion_control(tp);
4196
4831 /* Prevent spurious tcp_cwnd_restart() on 4197 /* Prevent spurious tcp_cwnd_restart() on
4832 * first data packet. 4198 * first data packet.
4833 */ 4199 */
@@ -4931,7 +4297,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4931 4297
4932 /* tcp_data could move socket to TIME-WAIT */ 4298 /* tcp_data could move socket to TIME-WAIT */
4933 if (sk->sk_state != TCP_CLOSE) { 4299 if (sk->sk_state != TCP_CLOSE) {
4934 tcp_data_snd_check(sk); 4300 tcp_data_snd_check(sk, tp);
4935 tcp_ack_snd_check(sk); 4301 tcp_ack_snd_check(sk);
4936 } 4302 }
4937 4303
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad19..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,9 +2045,10 @@ static int tcp_v4_init_sock(struct sock *sk)
2045 */ 2045 */
2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */ 2046 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2047 tp->snd_cwnd_clamp = ~0; 2047 tp->snd_cwnd_clamp = ~0;
2048 tp->mss_cache_std = tp->mss_cache = 536; 2048 tp->mss_cache = 536;
2049 2049
2050 tp->reordering = sysctl_tcp_reordering; 2050 tp->reordering = sysctl_tcp_reordering;
2051 tp->ca_ops = &tcp_init_congestion_ops;
2051 2052
2052 sk->sk_state = TCP_CLOSE; 2053 sk->sk_state = TCP_CLOSE;
2053 2054
@@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
2070 2071
2071 tcp_clear_xmit_timers(sk); 2072 tcp_clear_xmit_timers(sk);
2072 2073
2074 tcp_cleanup_congestion_control(tp);
2075
2073 /* Cleanup up the write buffer. */ 2076 /* Cleanup up the write buffer. */
2074 sk_stream_writequeue_purge(sk); 2077 sk_stream_writequeue_purge(sk);
2075 2078
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562f3..f42a284164b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
774 newtp->frto_counter = 0; 774 newtp->frto_counter = 0;
775 newtp->frto_highmark = 0; 775 newtp->frto_highmark = 0;
776 776
777 newtp->ca_ops = &tcp_reno;
778
777 tcp_set_ca_state(newtp, TCP_CA_Open); 779 tcp_set_ca_state(newtp, TCP_CA_Open);
778 tcp_init_xmit_timers(newsk); 780 tcp_init_xmit_timers(newsk);
779 skb_queue_head_init(&newtp->out_of_order_queue); 781 skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
842 if (newtp->ecn_flags&TCP_ECN_OK) 844 if (newtp->ecn_flags&TCP_ECN_OK)
843 sock_set_flag(newsk, SOCK_NO_LARGESEND); 845 sock_set_flag(newsk, SOCK_NO_LARGESEND);
844 846
845 tcp_ca_init(newtp);
846
847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); 847 TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
848 } 848 }
849 return newsk; 849 return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e337..e3f8ea1bfa9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
49 * will allow a single TSO frame to consume. Building TSO frames 49 * will allow a single TSO frame to consume. Building TSO frames
50 * which are too large can cause TCP streams to be bursty. 50 * which are too large can cause TCP streams to be bursty.
51 */ 51 */
52int sysctl_tcp_tso_win_divisor = 8; 52int sysctl_tcp_tso_win_divisor = 3;
53 53
54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, 54static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 struct sk_buff *skb) 55 struct sk_buff *skb)
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
111 u32 restart_cwnd = tcp_init_cwnd(tp, dst); 111 u32 restart_cwnd = tcp_init_cwnd(tp, dst);
112 u32 cwnd = tp->snd_cwnd; 112 u32 cwnd = tp->snd_cwnd;
113 113
114 if (tcp_is_vegas(tp)) 114 tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
115 tcp_vegas_enable(tp);
116 115
117 tp->snd_ssthresh = tcp_current_ssthresh(tp); 116 tp->snd_ssthresh = tcp_current_ssthresh(tp);
118 restart_cwnd = min(restart_cwnd, cwnd); 117 restart_cwnd = min(restart_cwnd, cwnd);
@@ -141,11 +140,11 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp,
141 tp->ack.pingpong = 1; 140 tp->ack.pingpong = 1;
142} 141}
143 142
144static __inline__ void tcp_event_ack_sent(struct sock *sk) 143static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
145{ 144{
146 struct tcp_sock *tp = tcp_sk(sk); 145 struct tcp_sock *tp = tcp_sk(sk);
147 146
148 tcp_dec_quickack_mode(tp); 147 tcp_dec_quickack_mode(tp, pkts);
149 tcp_clear_xmit_timer(sk, TCP_TIME_DACK); 148 tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
150} 149}
151 150
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
280#define SYSCTL_FLAG_WSCALE 0x2 279#define SYSCTL_FLAG_WSCALE 0x2
281#define SYSCTL_FLAG_SACK 0x4 280#define SYSCTL_FLAG_SACK 0x4
282 281
282 /* If congestion control is doing timestamping */
283 if (tp->ca_ops->rtt_sample)
284 do_gettimeofday(&skb->stamp);
285
283 sysctl_flags = 0; 286 sysctl_flags = 0;
284 if (tcb->flags & TCPCB_FLAG_SYN) { 287 if (tcb->flags & TCPCB_FLAG_SYN) {
285 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; 288 tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
304 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 307 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
305 } 308 }
306 309
307 /* 310 if (tcp_packets_in_flight(tp) == 0)
308 * If the connection is idle and we are restarting, 311 tcp_ca_event(tp, CA_EVENT_TX_START);
309 * then we don't want to do any Vegas calculations
310 * until we get fresh RTT samples. So when we
311 * restart, we reset our Vegas state to a clean
312 * slate. After we get acks for this flight of
313 * packets, _then_ we can make Vegas calculations
314 * again.
315 */
316 if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
317 tcp_vegas_enable(tp);
318 312
319 th = (struct tcphdr *) skb_push(skb, tcp_header_size); 313 th = (struct tcphdr *) skb_push(skb, tcp_header_size);
320 skb->h.th = th; 314 skb->h.th = th;
@@ -361,7 +355,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
361 tp->af_specific->send_check(sk, th, skb->len, skb); 355 tp->af_specific->send_check(sk, th, skb->len, skb);
362 356
363 if (tcb->flags & TCPCB_FLAG_ACK) 357 if (tcb->flags & TCPCB_FLAG_ACK)
364 tcp_event_ack_sent(sk); 358 tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
365 359
366 if (skb->len != tcp_header_size) 360 if (skb->len != tcp_header_size)
367 tcp_event_data_sent(tp, skb, sk); 361 tcp_event_data_sent(tp, skb, sk);
@@ -409,42 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
409 sk->sk_send_head = skb; 403 sk->sk_send_head = skb;
410} 404}
411 405
412static inline void tcp_tso_set_push(struct sk_buff *skb) 406static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
413{
414 /* Force push to be on for any TSO frames to workaround
415 * problems with busted implementations like Mac OS-X that
416 * hold off socket receive wakeups until push is seen.
417 */
418 if (tcp_skb_pcount(skb) > 1)
419 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
420}
421
422/* Send _single_ skb sitting at the send head. This function requires
423 * true push pending frames to setup probe timer etc.
424 */
425void tcp_push_one(struct sock *sk, unsigned cur_mss)
426{ 407{
427 struct tcp_sock *tp = tcp_sk(sk); 408 struct tcp_sock *tp = tcp_sk(sk);
428 struct sk_buff *skb = sk->sk_send_head;
429 409
430 if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) { 410 if (skb->len <= tp->mss_cache ||
431 /* Send it out now. */
432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
433 tcp_tso_set_push(skb);
434 if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
435 sk->sk_send_head = NULL;
436 tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
437 tcp_packets_out_inc(sk, tp, skb);
438 return;
439 }
440 }
441}
442
443void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
444{
445 struct tcp_sock *tp = tcp_sk(sk);
446
447 if (skb->len <= tp->mss_cache_std ||
448 !(sk->sk_route_caps & NETIF_F_TSO)) { 411 !(sk->sk_route_caps & NETIF_F_TSO)) {
449 /* Avoid the costly divide in the normal 412 /* Avoid the costly divide in the normal
450 * non-TSO case. 413 * non-TSO case.
@@ -454,10 +417,10 @@ void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
454 } else { 417 } else {
455 unsigned int factor; 418 unsigned int factor;
456 419
457 factor = skb->len + (tp->mss_cache_std - 1); 420 factor = skb->len + (tp->mss_cache - 1);
458 factor /= tp->mss_cache_std; 421 factor /= tp->mss_cache;
459 skb_shinfo(skb)->tso_segs = factor; 422 skb_shinfo(skb)->tso_segs = factor;
460 skb_shinfo(skb)->tso_size = tp->mss_cache_std; 423 skb_shinfo(skb)->tso_size = tp->mss_cache;
461 } 424 }
462} 425}
463 426
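The factor computed above is a ceiling division, segments = ceil(len / mss), obtained by adding mss - 1 before dividing. For instance:

    #include <stdio.h>

    int main(void)
    {
            unsigned int len = 4000, mss = 1448;
            unsigned int factor = (len + mss - 1) / mss;    /* = 3 */

            printf("%u bytes at mss %u -> %u segments\n", len, mss, factor);
            return 0;
    }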
@@ -521,6 +484,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
521 * skbs, which it never sent before. --ANK 484 * skbs, which it never sent before. --ANK
522 */ 485 */
523 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; 486 TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
487 buff->stamp = skb->stamp;
524 488
525 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { 489 if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
526 tp->lost_out -= tcp_skb_pcount(skb); 490 tp->lost_out -= tcp_skb_pcount(skb);
@@ -542,6 +506,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
542 } 506 }
543 507
544 /* Link BUFF into the send queue. */ 508 /* Link BUFF into the send queue. */
509 skb_header_release(buff);
545 __skb_append(skb, buff); 510 __skb_append(skb, buff);
546 511
547 return 0; 512 return 0;
@@ -662,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
662 627
663 /* And store cached results */ 628 /* And store cached results */
664 tp->pmtu_cookie = pmtu; 629 tp->pmtu_cookie = pmtu;
665 tp->mss_cache = tp->mss_cache_std = mss_now; 630 tp->mss_cache = mss_now;
666 631
667 return mss_now; 632 return mss_now;
668} 633}
@@ -674,57 +639,316 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
674 * cannot be large. However, taking into account rare use of URG, this 639 * cannot be large. However, taking into account rare use of URG, this
675 * is not a big flaw. 640 * is not a big flaw.
676 */ 641 */
677 642unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
678unsigned int tcp_current_mss(struct sock *sk, int large)
679{ 643{
680 struct tcp_sock *tp = tcp_sk(sk); 644 struct tcp_sock *tp = tcp_sk(sk);
681 struct dst_entry *dst = __sk_dst_get(sk); 645 struct dst_entry *dst = __sk_dst_get(sk);
682 unsigned int do_large, mss_now; 646 u32 mss_now;
647 u16 xmit_size_goal;
648 int doing_tso = 0;
649
650 mss_now = tp->mss_cache;
651
652 if (large_allowed &&
653 (sk->sk_route_caps & NETIF_F_TSO) &&
654 !tp->urg_mode)
655 doing_tso = 1;
683 656
684 mss_now = tp->mss_cache_std;
685 if (dst) { 657 if (dst) {
686 u32 mtu = dst_mtu(dst); 658 u32 mtu = dst_mtu(dst);
687 if (mtu != tp->pmtu_cookie) 659 if (mtu != tp->pmtu_cookie)
688 mss_now = tcp_sync_mss(sk, mtu); 660 mss_now = tcp_sync_mss(sk, mtu);
689 } 661 }
690 662
691 do_large = (large && 663 if (tp->rx_opt.eff_sacks)
692 (sk->sk_route_caps & NETIF_F_TSO) && 664 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
693 !tp->urg_mode); 665 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
694 666
695 if (do_large) { 667 xmit_size_goal = mss_now;
696 unsigned int large_mss, factor, limit;
697 668
698 large_mss = 65535 - tp->af_specific->net_header_len - 669 if (doing_tso) {
670 xmit_size_goal = 65535 -
671 tp->af_specific->net_header_len -
699 tp->ext_header_len - tp->tcp_header_len; 672 tp->ext_header_len - tp->tcp_header_len;
700 673
701 if (tp->max_window && large_mss > (tp->max_window>>1)) 674 if (tp->max_window &&
702 large_mss = max((tp->max_window>>1), 675 (xmit_size_goal > (tp->max_window >> 1)))
703 68U - tp->tcp_header_len); 676 xmit_size_goal = max((tp->max_window >> 1),
677 68U - tp->tcp_header_len);
704 678
705 factor = large_mss / mss_now; 679 xmit_size_goal -= (xmit_size_goal % mss_now);
680 }
681 tp->xmit_size_goal = xmit_size_goal;
706 682
707 /* Always keep large mss multiple of real mss, but 683 return mss_now;
708 * do not exceed 1/tso_win_divisor of the congestion window 684}
709 * so we can keep the ACK clock ticking and minimize 685
710 * bursting. 686/* Congestion window validation. (RFC2861) */
711 */ 687
712 limit = tp->snd_cwnd; 688static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
713 if (sysctl_tcp_tso_win_divisor) 689{
714 limit /= sysctl_tcp_tso_win_divisor; 690 __u32 packets_out = tp->packets_out;
715 limit = max(1U, limit);
716 if (factor > limit)
717 factor = limit;
718 691
719 tp->mss_cache = mss_now * factor; 692 if (packets_out >= tp->snd_cwnd) {
693 /* Network is fed fully. */
694 tp->snd_cwnd_used = 0;
695 tp->snd_cwnd_stamp = tcp_time_stamp;
696 } else {
697 /* Network starves. */
698 if (tp->packets_out > tp->snd_cwnd_used)
699 tp->snd_cwnd_used = tp->packets_out;
720 700
721 mss_now = tp->mss_cache; 701 if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
702 tcp_cwnd_application_limited(sk);
722 } 703 }
704}
723 705
724 if (tp->rx_opt.eff_sacks) 706static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
725 mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + 707{
726 (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); 708 u32 window, cwnd_len;
727 return mss_now; 709
710 window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
711 cwnd_len = mss_now * cwnd;
712 return min(window, cwnd_len);
713}
714
715/* Can at least one segment of SKB be sent right now, according to the
716 * congestion window rules? If so, return how many segments are allowed.
717 */
718static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
719{
720 u32 in_flight, cwnd;
721
722 /* Don't be strict about the congestion window for the final FIN. */
723 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
724 return 1;
725
726 in_flight = tcp_packets_in_flight(tp);
727 cwnd = tp->snd_cwnd;
728 if (in_flight < cwnd)
729 return (cwnd - in_flight);
730
731 return 0;
732}
733
734/* This must be invoked the first time we consider transmitting
735 * SKB onto the wire.
736 */
737static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
738{
739 int tso_segs = tcp_skb_pcount(skb);
740
741 if (!tso_segs) {
742 tcp_set_skb_tso_segs(sk, skb);
743 tso_segs = tcp_skb_pcount(skb);
744 }
745 return tso_segs;
746}
747
748static inline int tcp_minshall_check(const struct tcp_sock *tp)
749{
750 return after(tp->snd_sml,tp->snd_una) &&
751 !after(tp->snd_sml, tp->snd_nxt);
752}
753
754/* Return 0, if packet can be sent now without violating Nagle's rules:
755 * 1. It is full sized.
756 * 2. Or it contains FIN. (already checked by caller)
757 * 3. Or TCP_NODELAY was set.
758 * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
759 * With Minshall's modification: all sent small packets are ACKed.
760 */
761
762static inline int tcp_nagle_check(const struct tcp_sock *tp,
763 const struct sk_buff *skb,
764 unsigned mss_now, int nonagle)
765{
766 return (skb->len < mss_now &&
767 ((nonagle&TCP_NAGLE_CORK) ||
768 (!nonagle &&
769 tp->packets_out &&
770 tcp_minshall_check(tp))));
771}
772
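tcp_nagle_check() above holds back a sub-MSS segment only when the socket is corked, or when Nagle is enabled and an earlier small packet is still unacknowledged (Minshall's rule, via tcp_minshall_check()). A standalone sketch of the same predicate, with a hypothetical flag value standing in for TCP_NAGLE_CORK:

    #include <stdio.h>

    #define NAGLE_CORK 2    /* hypothetical flag value for this sketch */

    /* Nonzero when the segment must wait (mirrors tcp_nagle_check). */
    static int nagle_holds(unsigned int len, unsigned int mss, int nonagle,
                           int packets_out, int small_pkt_unacked)
    {
            return len < mss &&
                   ((nonagle & NAGLE_CORK) ||
                    (!nonagle && packets_out && small_pkt_unacked));
    }

    int main(void)
    {
            printf("full-sized: %d\n", nagle_holds(1448, 1448, 0, 1, 1));
            printf("small, idle: %d\n", nagle_holds(100, 1448, 0, 0, 0));
            printf("small, unacked small pkt: %d\n",
                   nagle_holds(100, 1448, 0, 3, 1));
            return 0;
    }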
773/* Return non-zero if the Nagle test allows this packet to be
774 * sent now.
775 */
776static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
777 unsigned int cur_mss, int nonagle)
778{
779 /* Nagle rule does not apply to frames which sit in the middle of the
780 * write_queue (they have no chance to get new data).
781 *
782 * This is implemented in the callers, where they modify the 'nonagle'
783 * argument based upon the location of SKB in the send queue.
784 */
785 if (nonagle & TCP_NAGLE_PUSH)
786 return 1;
787
788 /* Don't use the nagle rule for urgent data (or for the final FIN). */
789 if (tp->urg_mode ||
790 (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
791 return 1;
792
793 if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
794 return 1;
795
796 return 0;
797}
798
799/* Does at least the first segment of SKB fit into the send window? */
800static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
801{
802 u32 end_seq = TCP_SKB_CB(skb)->end_seq;
803
804 if (skb->len > cur_mss)
805 end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
806
807 return !after(end_seq, tp->snd_una + tp->snd_wnd);
808}
809
810/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
811 * should be put on the wire right now. If so, it returns the number of
812 * packets allowed by the congestion window.
813 */
814static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
815 unsigned int cur_mss, int nonagle)
816{
817 struct tcp_sock *tp = tcp_sk(sk);
818 unsigned int cwnd_quota;
819
820 tcp_init_tso_segs(sk, skb);
821
822 if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
823 return 0;
824
825 cwnd_quota = tcp_cwnd_test(tp, skb);
826 if (cwnd_quota &&
827 !tcp_snd_wnd_test(tp, skb, cur_mss))
828 cwnd_quota = 0;
829
830 return cwnd_quota;
831}
832
833static inline int tcp_skb_is_last(const struct sock *sk,
834 const struct sk_buff *skb)
835{
836 return skb->next == (struct sk_buff *)&sk->sk_write_queue;
837}
838
839int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
840{
841 struct sk_buff *skb = sk->sk_send_head;
842
843 return (skb &&
844 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
845 (tcp_skb_is_last(sk, skb) ?
846 TCP_NAGLE_PUSH :
847 tp->nonagle)));
848}
849
850/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
851 * which is put after SKB on the list. It is very much like
852 * tcp_fragment() except that it may make several kinds of assumptions
853 * in order to speed up the splitting operation. In particular, we
854 * know that all the data is in scatter-gather pages, and that the
855 * packet has never been sent out before (and thus is not cloned).
856 */
857static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
858{
859 struct sk_buff *buff;
860 int nlen = skb->len - len;
861 u16 flags;
862
863 /* All of a TSO frame must be composed of paged data. */
864 BUG_ON(skb->len != skb->data_len);
865
866 buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
867 if (unlikely(buff == NULL))
868 return -ENOMEM;
869
870 buff->truesize = nlen;
871 skb->truesize -= nlen;
872
873 /* Correct the sequence numbers. */
874 TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
875 TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
876 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
877
878 /* PSH and FIN should only be set in the second packet. */
879 flags = TCP_SKB_CB(skb)->flags;
880 TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
881 TCP_SKB_CB(buff)->flags = flags;
882
883 /* This packet was never sent out yet, so no SACK bits. */
884 TCP_SKB_CB(buff)->sacked = 0;
885
886 buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
887 skb_split(skb, buff, len);
888
889 /* Fix up tso_factor for both original and new SKB. */
890 tcp_set_skb_tso_segs(sk, skb);
891 tcp_set_skb_tso_segs(sk, buff);
892
893 /* Link BUFF into the send queue. */
894 skb_header_release(buff);
895 __skb_append(skb, buff);
896
897 return 0;
898}
899
900/* Try to defer sending, if possible, in order to minimize the amount
901 * of TSO splitting we do. View it as a kind of TSO Nagle test.
902 *
903 * This algorithm is from John Heffner.
904 */
905static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
906{
907 u32 send_win, cong_win, limit, in_flight;
908
909 if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
910 return 0;
911
912 if (tp->ca_state != TCP_CA_Open)
913 return 0;
914
915 in_flight = tcp_packets_in_flight(tp);
916
917 BUG_ON(tcp_skb_pcount(skb) <= 1 ||
918 (tp->snd_cwnd <= in_flight));
919
920 send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
921
922 /* From in_flight test above, we know that cwnd > in_flight. */
923 cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
924
925 limit = min(send_win, cong_win);
926
927 /* If sk_send_head can be sent fully now, just do it. */
928 if (skb->len <= limit)
929 return 0;
930
931 if (sysctl_tcp_tso_win_divisor) {
932 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
933
934 /* If at least some fraction of a window is available,
935 * just use it.
936 */
937 chunk /= sysctl_tcp_tso_win_divisor;
938 if (limit >= chunk)
939 return 0;
940 } else {
941 /* Different approach, try not to defer past a single
942 * ACK. Receiver should ACK every other full sized
943 * frame, so if we have space for more than 3 frames
944 * then send now.
945 */
946 if (limit > tcp_max_burst(tp) * tp->mss_cache)
947 return 0;
948 }
949
950 /* Ok, it looks like it is advisable to defer. */
951 return 1;
728} 952}
729 953
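In the sysctl-enabled branch, the deferral decision above weighs what could go out now (the smaller of the send window and cwnd headroom) against a 1/tso_win_divisor share of the window, with the divisor now defaulting to 3 per the change at the top of this file. A standalone sketch of that branch with made-up numbers (the MSS and window sizes below are arbitrary):

    #include <stdio.h>

    #define MSS 1448
    #define TSO_WIN_DIVISOR 3   /* new default from this patch */

    /* Returns 1 to defer, 0 to send now; mirrors the heuristic above
     * for the sysctl-enabled branch only. */
    static int should_defer(unsigned int skb_len, unsigned int send_win,
                            unsigned int cwnd, unsigned int in_flight,
                            unsigned int snd_wnd)
    {
            unsigned int cong_win = (cwnd - in_flight) * MSS;
            unsigned int limit = send_win < cong_win ? send_win : cong_win;
            unsigned int chunk;

            if (skb_len <= limit)       /* fits entirely: send now */
                    return 0;

            chunk = snd_wnd < cwnd * MSS ? snd_wnd : cwnd * MSS;
            chunk /= TSO_WIN_DIVISOR;
            return limit < chunk;       /* defer unless a decent share fits */
    }

    int main(void)
    {
            /* 64KB super-packet, 20-segment cwnd with 15 in flight */
            printf("defer=%d\n", should_defer(65536, 48 * 1024,
                                              20, 15, 64 * 1024));
            return 0;
    }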
730/* This routine writes packets to the network. It advances the 954/* This routine writes packets to the network. It advances the
@@ -734,57 +958,158 @@ unsigned int tcp_current_mss(struct sock *sk, int large)
734 * Returns 1, if no segments are in flight and we have queued segments, but 958 * Returns 1, if no segments are in flight and we have queued segments, but
735 * cannot send anything now because of SWS or another problem. 959 * cannot send anything now because of SWS or another problem.
736 */ 960 */
737int tcp_write_xmit(struct sock *sk, int nonagle) 961static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
738{ 962{
739 struct tcp_sock *tp = tcp_sk(sk); 963 struct tcp_sock *tp = tcp_sk(sk);
740 unsigned int mss_now; 964 struct sk_buff *skb;
965 unsigned int tso_segs, sent_pkts;
966 int cwnd_quota;
741 967
742 /* If we are closed, the bytes will have to remain here. 968 /* If we are closed, the bytes will have to remain here.
743 * In time closedown will finish, we empty the write queue and all 969 * In time closedown will finish, we empty the write queue and all
744 * will be happy. 970 * will be happy.
745 */ 971 */
746 if (sk->sk_state != TCP_CLOSE) { 972 if (unlikely(sk->sk_state == TCP_CLOSE))
747 struct sk_buff *skb; 973 return 0;
748 int sent_pkts = 0; 974
975 skb = sk->sk_send_head;
976 if (unlikely(!skb))
977 return 0;
978
979 tso_segs = tcp_init_tso_segs(sk, skb);
980 cwnd_quota = tcp_cwnd_test(tp, skb);
981 if (unlikely(!cwnd_quota))
982 goto out;
983
984 sent_pkts = 0;
985 while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
986 BUG_ON(!tso_segs);
987
988 if (tso_segs == 1) {
989 if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
990 (tcp_skb_is_last(sk, skb) ?
991 nonagle : TCP_NAGLE_PUSH))))
992 break;
993 } else {
994 if (tcp_tso_should_defer(sk, tp, skb))
995 break;
996 }
749 997
750 /* Account for SACKS, we may need to fragment due to this. 998 if (tso_segs > 1) {
751 * It is just like the real MSS changing on us midstream. 999 u32 limit = tcp_window_allows(tp, skb,
752 * We also handle things correctly when the user adds some 1000 mss_now, cwnd_quota);
753 * IP options mid-stream. Silly to do, but cover it. 1001
754 */ 1002 if (skb->len < limit) {
755 mss_now = tcp_current_mss(sk, 1); 1003 unsigned int trim = skb->len % mss_now;
756 1004
757 while ((skb = sk->sk_send_head) && 1005 if (trim)
758 tcp_snd_test(sk, skb, mss_now, 1006 limit = skb->len - trim;
759 tcp_skb_is_last(sk, skb) ? nonagle : 1007 }
760 TCP_NAGLE_PUSH)) { 1008 if (skb->len > limit) {
761 if (skb->len > mss_now) { 1009 if (tso_fragment(sk, skb, limit))
762 if (tcp_fragment(sk, skb, mss_now))
763 break; 1010 break;
764 } 1011 }
765 1012 } else if (unlikely(skb->len > mss_now)) {
766 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1013 if (unlikely(tcp_fragment(sk, skb, mss_now)))
767 tcp_tso_set_push(skb);
768 if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
769 break; 1014 break;
1015 }
770 1016
771 /* Advance the send_head. This one is sent out. 1017 TCP_SKB_CB(skb)->when = tcp_time_stamp;
772 * This call will increment packets_out. 1018
773 */ 1019 if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
774 update_send_head(sk, tp, skb); 1020 break;
1021
1022 /* Advance the send_head. This one is sent out.
1023 * This call will increment packets_out.
1024 */
1025 update_send_head(sk, tp, skb);
1026
1027 tcp_minshall_update(tp, mss_now, skb);
1028 sent_pkts++;
1029
1030 /* Do not optimize this to use tso_segs. If we chopped up
1031 * the packet above, tso_segs will no longer be valid.
1032 */
1033 cwnd_quota -= tcp_skb_pcount(skb);
1034
1035 BUG_ON(cwnd_quota < 0);
1036 if (!cwnd_quota)
1037 break;
1038
1039 skb = sk->sk_send_head;
1040 if (!skb)
1041 break;
1042 tso_segs = tcp_init_tso_segs(sk, skb);
1043 }
1044
1045 if (likely(sent_pkts)) {
1046 tcp_cwnd_validate(sk, tp);
1047 return 0;
1048 }
1049out:
1050 return !tp->packets_out && sk->sk_send_head;
1051}
1052
1053/* Push out any pending frames which were held back due to
1054 * TCP_CORK or attempt at coalescing tiny packets.
1055 * The socket must be locked by the caller.
1056 */
1057void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
1058 unsigned int cur_mss, int nonagle)
1059{
1060 struct sk_buff *skb = sk->sk_send_head;
1061
1062 if (skb) {
1063 if (tcp_write_xmit(sk, cur_mss, nonagle))
1064 tcp_check_probe_timer(sk, tp);
1065 }
1066}
1067
1068/* Send _single_ skb sitting at the send head. This function requires
1069 * true push pending frames to setup probe timer etc.
1070 */
1071void tcp_push_one(struct sock *sk, unsigned int mss_now)
1072{
1073 struct tcp_sock *tp = tcp_sk(sk);
1074 struct sk_buff *skb = sk->sk_send_head;
1075 unsigned int tso_segs, cwnd_quota;
1076
1077 BUG_ON(!skb || skb->len < mss_now);
1078
1079 tso_segs = tcp_init_tso_segs(sk, skb);
1080 cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
1081
1082 if (likely(cwnd_quota)) {
1083 BUG_ON(!tso_segs);
775 1084
776 tcp_minshall_update(tp, mss_now, skb); 1085 if (tso_segs > 1) {
777 sent_pkts = 1; 1086 u32 limit = tcp_window_allows(tp, skb,
1087 mss_now, cwnd_quota);
1088
1089 if (skb->len < limit) {
1090 unsigned int trim = skb->len % mss_now;
1091
1092 if (trim)
1093 limit = skb->len - trim;
1094 }
1095 if (skb->len > limit) {
1096 if (unlikely(tso_fragment(sk, skb, limit)))
1097 return;
1098 }
1099 } else if (unlikely(skb->len > mss_now)) {
1100 if (unlikely(tcp_fragment(sk, skb, mss_now)))
1101 return;
778 } 1102 }
779 1103
780 if (sent_pkts) { 1104 /* Send it out now. */
1105 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1106
1107 if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
1108 update_send_head(sk, tp, skb);
781 tcp_cwnd_validate(sk, tp); 1109 tcp_cwnd_validate(sk, tp);
782 return 0; 1110 return;
783 } 1111 }
784
785 return !tp->packets_out && sk->sk_send_head;
786 } 1112 }
787 return 0;
788} 1113}
789 1114
790/* This function returns the amount that we can raise the 1115/* This function returns the amount that we can raise the
@@ -1044,7 +1369,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1044 if (sk->sk_route_caps & NETIF_F_TSO) { 1369 if (sk->sk_route_caps & NETIF_F_TSO) {
1045 sk->sk_route_caps &= ~NETIF_F_TSO; 1370 sk->sk_route_caps &= ~NETIF_F_TSO;
1046 sock_set_flag(sk, SOCK_NO_LARGESEND); 1371 sock_set_flag(sk, SOCK_NO_LARGESEND);
1047 tp->mss_cache = tp->mss_cache_std;
1048 } 1372 }
1049 1373
1050 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 1374 if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1106,7 +1430,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1106 * is still in somebody's hands, else make a clone. 1430 * is still in somebody's hands, else make a clone.
1107 */ 1431 */
1108 TCP_SKB_CB(skb)->when = tcp_time_stamp; 1432 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1109 tcp_tso_set_push(skb);
1110 1433
1111 err = tcp_transmit_skb(sk, (skb_cloned(skb) ? 1434 err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
1112 pskb_copy(skb, GFP_ATOMIC): 1435 pskb_copy(skb, GFP_ATOMIC):
@@ -1290,7 +1613,7 @@ void tcp_send_fin(struct sock *sk)
1290 * was unread data in the receive queue. This behavior is recommended 1613 * was unread data in the receive queue. This behavior is recommended
1291 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM 1614 * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
1292 */ 1615 */
1293void tcp_send_active_reset(struct sock *sk, int priority) 1616void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
1294{ 1617{
1295 struct tcp_sock *tp = tcp_sk(sk); 1618 struct tcp_sock *tp = tcp_sk(sk);
1296 struct sk_buff *skb; 1619 struct sk_buff *skb;
@@ -1449,7 +1772,6 @@ static inline void tcp_connect_init(struct sock *sk)
1449 tp->window_clamp = dst_metric(dst, RTAX_WINDOW); 1772 tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
1450 tp->advmss = dst_metric(dst, RTAX_ADVMSS); 1773 tp->advmss = dst_metric(dst, RTAX_ADVMSS);
1451 tcp_initialize_rcv_mss(sk); 1774 tcp_initialize_rcv_mss(sk);
1452 tcp_ca_init(tp);
1453 1775
1454 tcp_select_initial_window(tcp_full_space(sk), 1776 tcp_select_initial_window(tcp_full_space(sk),
1455 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), 1777 tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1825,6 @@ int tcp_connect(struct sock *sk)
1503 TCP_SKB_CB(buff)->end_seq = tp->write_seq; 1825 TCP_SKB_CB(buff)->end_seq = tp->write_seq;
1504 tp->snd_nxt = tp->write_seq; 1826 tp->snd_nxt = tp->write_seq;
1505 tp->pushed_seq = tp->write_seq; 1827 tp->pushed_seq = tp->write_seq;
1506 tcp_ca_init(tp);
1507 1828
1508 /* Send it off. */ 1829 /* Send it off. */
1509 TCP_SKB_CB(buff)->when = tcp_time_stamp; 1830 TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1677,14 +1998,12 @@ int tcp_write_wakeup(struct sock *sk)
1677 if (sk->sk_route_caps & NETIF_F_TSO) { 1998 if (sk->sk_route_caps & NETIF_F_TSO) {
1678 sock_set_flag(sk, SOCK_NO_LARGESEND); 1999 sock_set_flag(sk, SOCK_NO_LARGESEND);
1679 sk->sk_route_caps &= ~NETIF_F_TSO; 2000 sk->sk_route_caps &= ~NETIF_F_TSO;
1680 tp->mss_cache = tp->mss_cache_std;
1681 } 2001 }
1682 } else if (!tcp_skb_pcount(skb)) 2002 } else if (!tcp_skb_pcount(skb))
1683 tcp_set_skb_tso_segs(sk, skb); 2003 tcp_set_skb_tso_segs(sk, skb);
1684 2004
1685 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; 2005 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1686 TCP_SKB_CB(skb)->when = tcp_time_stamp; 2006 TCP_SKB_CB(skb)->when = tcp_time_stamp;
1687 tcp_tso_set_push(skb);
1688 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); 2007 err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
1689 if (!err) { 2008 if (!err) {
1690 update_send_head(sk, tp, skb); 2009 update_send_head(sk, tp, skb);
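
The tcp_tso_should_defer() heuristic added above reduces to window arithmetic: take the smaller of the free send window and the free congestion window; send immediately if the skb fits entirely, if at least a 1/tcp_tso_win_divisor fraction of the maximum possible window is free, or (with the divisor unset) if there is room for more than tcp_max_burst() full-sized frames; otherwise hold the skb back in the hope of building a larger TSO frame. A standalone user-space sketch of that decision, with a hypothetical snapshot struct standing in for tcp_sock state and simplified send-window accounting (illustrative values, not kernel code):

    #include <stdio.h>

    /* Hypothetical snapshot of sender state; field names mirror
     * tcp_sock members, but this is only an illustration. */
    struct snap {
            unsigned int snd_wnd;     /* advertised window left (bytes)    */
            unsigned int snd_cwnd;    /* congestion window (packets)       */
            unsigned int in_flight;   /* packets sent but not yet acked    */
            unsigned int mss;         /* sender MSS (bytes)                */
            unsigned int skb_len;     /* length of skb at the send head    */
            unsigned int win_divisor; /* sysctl_tcp_tso_win_divisor, 0=off */
            unsigned int max_burst;   /* tcp_max_burst(), typically 3      */
    };

    /* Returns 1 to defer transmission, 0 to send now. */
    static int tso_should_defer(const struct snap *s)
    {
            unsigned int cong_win = (s->snd_cwnd - s->in_flight) * s->mss;
            unsigned int limit = s->snd_wnd < cong_win ? s->snd_wnd : cong_win;

            if (s->skb_len <= limit)        /* fits fully: just send it */
                    return 0;

            if (s->win_divisor) {
                    unsigned int chunk = s->snd_cwnd * s->mss;

                    if (chunk > s->snd_wnd)
                            chunk = s->snd_wnd;
                    chunk /= s->win_divisor;
                    if (limit >= chunk)     /* enough of a window is free */
                            return 0;
            } else if (limit > s->max_burst * s->mss) {
                    return 0;               /* room for > 3 frames: send */
            }
            return 1;                       /* defer, hope to coalesce */
    }

    int main(void)
    {
            struct snap s = { .snd_wnd = 65535, .snd_cwnd = 20,
                              .in_flight = 18, .mss = 1448,
                              .skb_len = 64000, .win_divisor = 0,
                              .max_burst = 3 };

            /* cong_win = 2 * 1448 bytes, under the 3-frame burst limit,
             * and the 64000-byte skb does not fit: the sketch defers. */
            printf("defer = %d\n", tso_should_defer(&s));
            return 0;
    }
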
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
new file mode 100644
index 000000000000..70e108e15c71
--- /dev/null
+++ b/net/ipv4/tcp_scalable.c
@@ -0,0 +1,68 @@
1/* Tom Kelly's Scalable TCP
2 *
3 * See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/
4 *
5 * John Heffner <jheffner@psc.edu>
6 */
7
8#include <linux/config.h>
9#include <linux/module.h>
10#include <net/tcp.h>
11
12/* These factors are derived from the recommended values in the paper:
13 * .01 and 7/8. We use 50 instead of 100 to account for
14 * delayed ack.
15 */
16#define TCP_SCALABLE_AI_CNT 50U
17#define TCP_SCALABLE_MD_SCALE 3
18
19static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
20 u32 in_flight, int flag)
21{
22 if (in_flight < tp->snd_cwnd)
23 return;
24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) {
26 tp->snd_cwnd++;
27 } else {
28 tp->snd_cwnd_cnt++;
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 tp->snd_cwnd++;
31 tp->snd_cwnd_cnt = 0;
32 }
33 }
34 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
35 tp->snd_cwnd_stamp = tcp_time_stamp;
36}
37
38static u32 tcp_scalable_ssthresh(struct tcp_sock *tp)
39{
40 return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
41}
42
43
44static struct tcp_congestion_ops tcp_scalable = {
45 .ssthresh = tcp_scalable_ssthresh,
46 .cong_avoid = tcp_scalable_cong_avoid,
47 .min_cwnd = tcp_reno_min_cwnd,
48
49 .owner = THIS_MODULE,
50 .name = "scalable",
51};
52
53static int __init tcp_scalable_register(void)
54{
55 return tcp_register_congestion_control(&tcp_scalable);
56}
57
58static void __exit tcp_scalable_unregister(void)
59{
60 tcp_unregister_congestion_control(&tcp_scalable);
61}
62
63module_init(tcp_scalable_register);
64module_exit(tcp_scalable_unregister);
65
66MODULE_AUTHOR("John Heffner");
67MODULE_LICENSE("GPL");
68MODULE_DESCRIPTION("Scalable TCP");
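
The two constants above encode the whole algorithm: in congestion avoidance cwnd grows by one segment per min(cwnd, 50) ACKs (roughly one per 50 once cwnd exceeds 50), and on loss ssthresh drops to cwnd - cwnd/8 instead of Reno's cwnd/2. A quick arithmetic sketch of both rules under assumed values:

    #include <stdio.h>

    #define AI_CNT   50U /* TCP_SCALABLE_AI_CNT */
    #define MD_SCALE 3   /* TCP_SCALABLE_MD_SCALE */

    int main(void)
    {
            unsigned int cwnd = 200, cnt = 0, acks;

            /* Additive increase: one extra segment per min(cwnd, 50) acks. */
            for (acks = 0; acks < 150; acks++) {
                    if (++cnt > (cwnd < AI_CNT ? cwnd : AI_CNT)) {
                            cwnd++;
                            cnt = 0;
                    }
            }
            printf("after 150 acks: cwnd = %u\n", cwnd); /* 200 -> 202 */

            /* Multiplicative decrease: keep 7/8 of cwnd, never below 2. */
            unsigned int ssthresh = cwnd - (cwnd >> MD_SCALE);
            if (ssthresh < 2)
                    ssthresh = 2;
            printf("after loss: ssthresh = %u\n", ssthresh); /* 177 */
            return 0;
    }

For comparison, Reno would halve the same window to 101 on loss, so Scalable TCP recovers its sending rate much faster on high bandwidth-delay paths.
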
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index b127b4498565..0084227438c2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -231,11 +231,10 @@ static void tcp_delack_timer(unsigned long data)
231 } 231 }
232 tp->ack.pending &= ~TCP_ACK_TIMER; 232 tp->ack.pending &= ~TCP_ACK_TIMER;
233 233
234 if (skb_queue_len(&tp->ucopy.prequeue)) { 234 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
235 struct sk_buff *skb; 235 struct sk_buff *skb;
236 236
237 NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 237 NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
238 skb_queue_len(&tp->ucopy.prequeue));
239 238
240 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) 239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
241 sk->sk_backlog_rcv(sk, skb); 240 sk->sk_backlog_rcv(sk, skb);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
new file mode 100644
index 000000000000..9bd443db5193
--- /dev/null
+++ b/net/ipv4/tcp_vegas.c
@@ -0,0 +1,411 @@
1/*
2 * TCP Vegas congestion control
3 *
4 * This is based on the congestion detection/avoidance scheme described in
5 * Lawrence S. Brakmo and Larry L. Peterson.
6 * "TCP Vegas: End to end congestion avoidance on a global internet."
7 * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
8 * October 1995. Available from:
9 * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
10 *
11 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
12 * The main aspects that distinguish this implementation from the
13 * Arizona Vegas implementation are:
14 * o We do not change the loss detection or recovery mechanisms of
15 * Linux in any way. Linux already recovers from losses quite well,
16 * using fine-grained timers, NewReno, and FACK.
17 * o To avoid the performance penalty imposed by increasing cwnd
18 * only every-other RTT during slow start, we increase during
19 * every RTT during slow start, just like Reno.
20 * o Largely to allow continuous cwnd growth during slow start,
21 * we use the rate at which ACKs come back as the "actual"
22 * rate, rather than the rate at which data is sent.
23 * o To speed convergence to the right rate, we set the cwnd
24 * to achieve the right ("actual") rate when we exit slow start.
25 * o To filter out the noise caused by delayed ACKs, we use the
26 * minimum RTT sample observed during the last RTT to calculate
27 * the actual rate.
28 * o When the sender re-starts from idle, it waits until it has
29 * received ACKs for an entire flight of new data before making
30 * a cwnd adjustment decision. The original Vegas implementation
31 * assumed senders never went idle.
32 */
33
34#include <linux/config.h>
35#include <linux/mm.h>
36#include <linux/module.h>
37#include <linux/skbuff.h>
38#include <linux/tcp_diag.h>
39
40#include <net/tcp.h>
41
42/* Default values of the Vegas variables, in fixed-point representation
43 * with V_PARAM_SHIFT bits to the right of the binary point.
44 */
45#define V_PARAM_SHIFT 1
46static int alpha = 1<<V_PARAM_SHIFT;
47static int beta = 3<<V_PARAM_SHIFT;
48static int gamma = 1<<V_PARAM_SHIFT;
49
50module_param(alpha, int, 0644);
51MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
52module_param(beta, int, 0644);
53MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
54module_param(gamma, int, 0644);
55MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
56
57
58/* Vegas variables */
59struct vegas {
60 u32 beg_snd_nxt; /* right edge during last RTT */
61 u32 beg_snd_una; /* left edge during last RTT */
62 u32 beg_snd_cwnd; /* saves the size of the cwnd */
63 u8 doing_vegas_now;/* if true, do vegas for this RTT */
64 u16 cntRTT; /* # of RTTs measured within last RTT */
65 u32 minRTT; /* min of RTTs measured within last RTT (in usec) */
66 u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */
67};
68
69/* There are several situations when we must "re-start" Vegas:
70 *
71 * o when a connection is established
72 * o after an RTO
73 * o after fast recovery
74 * o when we send a packet and there is no outstanding
75 * unacknowledged data (restarting an idle connection)
76 *
77 * In these circumstances we cannot do a Vegas calculation at the
78 * end of the first RTT, because any calculation we do is using
79 * stale info -- both the saved cwnd and congestion feedback are
80 * stale.
81 *
82 * Instead we must wait until the completion of an RTT during
83 * which we actually receive ACKs.
84 */
85static inline void vegas_enable(struct tcp_sock *tp)
86{
87 struct vegas *vegas = tcp_ca(tp);
88
89 /* Begin taking Vegas samples next time we send something. */
90 vegas->doing_vegas_now = 1;
91
92 /* Set the beginning of the next send window. */
93 vegas->beg_snd_nxt = tp->snd_nxt;
94
95 vegas->cntRTT = 0;
96 vegas->minRTT = 0x7fffffff;
97}
98
99/* Stop taking Vegas samples for now. */
100static inline void vegas_disable(struct tcp_sock *tp)
101{
102 struct vegas *vegas = tcp_ca(tp);
103
104 vegas->doing_vegas_now = 0;
105}
106
107static void tcp_vegas_init(struct tcp_sock *tp)
108{
109 struct vegas *vegas = tcp_ca(tp);
110
111 vegas->baseRTT = 0x7fffffff;
112 vegas_enable(tp);
113}
114
115/* Do RTT sampling needed for Vegas.
116 * Basically we:
117 * o min-filter RTT samples from within an RTT to get the current
118 * propagation delay + queuing delay (we are min-filtering to try to
119 * avoid the effects of delayed ACKs)
120 * o min-filter RTT samples from a much longer window (forever for now)
121 * to find the propagation delay (baseRTT)
122 */
123static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt)
124{
125 struct vegas *vegas = tcp_ca(tp);
126 u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */
127
128 /* Filter to find propagation delay: */
129 if (vrtt < vegas->baseRTT)
130 vegas->baseRTT = vrtt;
131
132 /* Find the min RTT during the last RTT to find
133 * the current prop. delay + queuing delay:
134 */
135 vegas->minRTT = min(vegas->minRTT, vrtt);
136 vegas->cntRTT++;
137}
138
139static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state)
140{
141
142 if (ca_state == TCP_CA_Open)
143 vegas_enable(tp);
144 else
145 vegas_disable(tp);
146}
147
148/*
149 * If the connection is idle and we are restarting,
150 * then we don't want to do any Vegas calculations
151 * until we get fresh RTT samples. So when we
152 * restart, we reset our Vegas state to a clean
153 * slate. After we get acks for this flight of
154 * packets, _then_ we can make Vegas calculations
155 * again.
156 */
157static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event)
158{
159 if (event == CA_EVENT_CWND_RESTART ||
160 event == CA_EVENT_TX_START)
161 tcp_vegas_init(tp);
162}
163
164static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack,
165 u32 seq_rtt, u32 in_flight, int flag)
166{
167 struct vegas *vegas = tcp_ca(tp);
168
169 if (!vegas->doing_vegas_now)
170 return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag);
171
172 /* The key players are v_beg_snd_una and v_beg_snd_nxt.
173 *
174 * These are so named because they represent the approximate values
175 * of snd_una and snd_nxt at the beginning of the current RTT. More
176 * precisely, they represent the amount of data sent during the RTT.
177 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
178 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
179 * bytes of data have been ACKed during the course of the RTT, giving
180 * an "actual" rate of:
181 *
182 * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
183 *
184 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
185 * because delayed ACKs can cover more than one segment, so they
186 * don't line up nicely with the boundaries of RTTs.
187 *
188 * Another unfortunate fact of life is that delayed ACKs delay the
189 * advance of the left edge of our send window, so that the number
190 * of bytes we send in an RTT is often less than our cwnd will allow.
191 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
192 */
193
194 if (after(ack, vegas->beg_snd_nxt)) {
195 /* Do the Vegas once-per-RTT cwnd adjustment. */
196 u32 old_wnd, old_snd_cwnd;
197
198
199 /* Here old_wnd is essentially the window of data that was
200 * sent during the previous RTT, and has all
201 * been acknowledged in the course of the RTT that ended
202 * with the ACK we just received. Likewise, old_snd_cwnd
203 * is the cwnd during the previous RTT.
204 */
205 old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
206 tp->mss_cache;
207 old_snd_cwnd = vegas->beg_snd_cwnd;
208
209 /* Save the extent of the current window so we can use this
210 * at the end of the next RTT.
211 */
212 vegas->beg_snd_una = vegas->beg_snd_nxt;
213 vegas->beg_snd_nxt = tp->snd_nxt;
214 vegas->beg_snd_cwnd = tp->snd_cwnd;
215
216 /* Take into account the current RTT sample too, to
217 * decrease the impact of delayed acks. This double counts
218 * this sample since we count it for the next window as well,
219 * but that's not too awful, since we're taking the min,
220 * rather than averaging.
221 */
222 tcp_vegas_rtt_calc(tp, seq_rtt*1000);
223
224 /* We do the Vegas calculations only if we got enough RTT
225 * samples that we can be reasonably sure that we got
226 * at least one RTT sample that wasn't from a delayed ACK.
227 * If we only had 2 samples total,
228 * then that means we're getting only 1 ACK per RTT, which
229 * means they're almost certainly delayed ACKs.
230 * If we have 3 samples, we should be OK.
231 */
232
233 if (vegas->cntRTT <= 2) {
234 /* We don't have enough RTT samples to do the Vegas
235 * calculation, so we'll behave like Reno.
236 */
237 if (tp->snd_cwnd > tp->snd_ssthresh)
238 tp->snd_cwnd++;
239 } else {
240 u32 rtt, target_cwnd, diff;
241
242 /* We have enough RTT samples, so, using the Vegas
243 * algorithm, we determine if we should increase or
244 * decrease cwnd, and by how much.
245 */
246
247 /* Pluck out the RTT we are using for the Vegas
248 * calculations. This is the min RTT seen during the
249 * last RTT. Taking the min filters out the effects
250 * of delayed ACKs, at the cost of noticing congestion
251 * a bit later.
252 */
253 rtt = vegas->minRTT;
254
255 /* Calculate the cwnd we should have, if we weren't
256 * going too fast.
257 *
258 * This is:
259 * (actual rate in segments) * baseRTT
260 * We keep it as a fixed point number with
261 * V_PARAM_SHIFT bits to the right of the binary point.
262 */
263 target_cwnd = ((old_wnd * vegas->baseRTT)
264 << V_PARAM_SHIFT) / rtt;
265
266 /* Calculate the difference between the window we had,
267 * and the window we would like to have. This quantity
268 * is the "Diff" from the Arizona Vegas papers.
269 *
270 * Again, this is a fixed point number with
271 * V_PARAM_SHIFT bits to the right of the binary
272 * point.
273 */
274 diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
275
276 if (tp->snd_cwnd < tp->snd_ssthresh) {
277 /* Slow start. */
278 if (diff > gamma) {
279 /* Going too fast. Time to slow down
280 * and switch to congestion avoidance.
281 */
282 tp->snd_ssthresh = 2;
283
284 /* Set cwnd to match the actual rate
285 * exactly:
286 * cwnd = (actual rate) * baseRTT
287 * Then we add 1 because the integer
288 * truncation robs us of full link
289 * utilization.
290 */
291 tp->snd_cwnd = min(tp->snd_cwnd,
292 (target_cwnd >>
293 V_PARAM_SHIFT)+1);
294
295 }
296 } else {
297 /* Congestion avoidance. */
298 u32 next_snd_cwnd;
299
300 /* Figure out where we would like cwnd
301 * to be.
302 */
303 if (diff > beta) {
304 /* The old window was too fast, so
305 * we slow down.
306 */
307 next_snd_cwnd = old_snd_cwnd - 1;
308 } else if (diff < alpha) {
309 /* We don't have enough extra packets
310 * in the network, so speed up.
311 */
312 next_snd_cwnd = old_snd_cwnd + 1;
313 } else {
314 /* Sending just as fast as we
315 * should be.
316 */
317 next_snd_cwnd = old_snd_cwnd;
318 }
319
320 /* Adjust cwnd upward or downward, toward the
321 * desired value.
322 */
323 if (next_snd_cwnd > tp->snd_cwnd)
324 tp->snd_cwnd++;
325 else if (next_snd_cwnd < tp->snd_cwnd)
326 tp->snd_cwnd--;
327 }
328 }
329
330 /* Wipe the slate clean for the next RTT. */
331 vegas->cntRTT = 0;
332 vegas->minRTT = 0x7fffffff;
333 }
334
335 /* The following code is executed for every ack we receive,
336 * except for conditions checked in should_advance_cwnd()
337 * before the call to tcp_cong_avoid(). Mainly this means that
338 * we only execute this code if the ack actually acked some
339 * data.
340 */
341
342 /* If we are in slow start, increase our cwnd in response to this ACK.
343 * (If we are not in slow start then we are in congestion avoidance,
344 * and adjust our congestion window only once per RTT. See the code
345 * above.)
346 */
347 if (tp->snd_cwnd <= tp->snd_ssthresh)
348 tp->snd_cwnd++;
349
350 /* to keep cwnd from growing without bound */
351 tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
352
353 /* Make sure that we are never so timid as to reduce our cwnd below
354 * 2 MSS.
355 *
356 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
357 */
358 tp->snd_cwnd = max(tp->snd_cwnd, 2U);
359}
360
361/* Extract TCP socket info for reporting via netlink. */
362static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext,
363 struct sk_buff *skb)
364{
365 const struct vegas *ca = tcp_ca(tp);
366 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
367 struct tcpvegas_info *info;
368
369 info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO,
370 sizeof(*info)));
371
372 info->tcpv_enabled = ca->doing_vegas_now;
373 info->tcpv_rttcnt = ca->cntRTT;
374 info->tcpv_rtt = ca->baseRTT;
375 info->tcpv_minrtt = ca->minRTT;
376 rtattr_failure: ;
377 }
378}
379
380static struct tcp_congestion_ops tcp_vegas = {
381 .init = tcp_vegas_init,
382 .ssthresh = tcp_reno_ssthresh,
383 .cong_avoid = tcp_vegas_cong_avoid,
384 .min_cwnd = tcp_reno_min_cwnd,
385 .rtt_sample = tcp_vegas_rtt_calc,
386 .set_state = tcp_vegas_state,
387 .cwnd_event = tcp_vegas_cwnd_event,
388 .get_info = tcp_vegas_get_info,
389
390 .owner = THIS_MODULE,
391 .name = "vegas",
392};
393
394static int __init tcp_vegas_register(void)
395{
396 BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE);
397 tcp_register_congestion_control(&tcp_vegas);
398 return 0;
399}
400
401static void __exit tcp_vegas_unregister(void)
402{
403 tcp_unregister_congestion_control(&tcp_vegas);
404}
405
406module_init(tcp_vegas_register);
407module_exit(tcp_vegas_unregister);
408
409MODULE_AUTHOR("Stephen Hemminger");
410MODULE_LICENSE("GPL");
411MODULE_DESCRIPTION("TCP Vegas");
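
The once-per-RTT decision above is all fixed-point arithmetic with a single fractional bit (V_PARAM_SHIFT = 1): target_cwnd = old_wnd * baseRTT / rtt and diff = old_wnd - target_cwnd, both scaled by 2, are compared against the scaled module parameters alpha = 2, beta = 6 and gamma = 2. A worked standalone sketch with assumed sample values (20 segments per RTT, propagation delay 100 time units, 125 observed):

    #include <stdio.h>

    #define V_PARAM_SHIFT 1
    static const unsigned int alpha = 1 << V_PARAM_SHIFT; /* 1 seg, scaled */
    static const unsigned int beta  = 3 << V_PARAM_SHIFT; /* 3 segs, scaled */

    int main(void)
    {
            /* Assumed sample: 20 segments covered the last RTT, baseRTT
             * is 100 units but the min RTT seen last round was 125,
             * meaning queues are building along the path. */
            unsigned int old_wnd = 20, base_rtt = 100, rtt = 125;

            /* Window that would match the bottleneck rate exactly,
             * kept in fixed point as in tcp_vegas_cong_avoid(). */
            unsigned int target_cwnd =
                    (old_wnd * base_rtt << V_PARAM_SHIFT) / rtt;
            unsigned int diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

            printf("target_cwnd = %u/2, diff = %u/2 segments\n",
                   target_cwnd, diff);

            /* diff = 40 - 32 = 8, i.e. 4 segments sitting in queues;
             * 4 exceeds beta (3), so congestion avoidance backs off. */
            if (diff > beta)
                    printf("diff > beta: decrease cwnd\n");
            else if (diff < alpha)
                    printf("diff < alpha: increase cwnd\n");
            else
                    printf("in [alpha, beta]: hold cwnd\n");
            return 0;
    }
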
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
new file mode 100644
index 000000000000..ef827242c940
--- /dev/null
+++ b/net/ipv4/tcp_westwood.c
@@ -0,0 +1,259 @@
1/*
2 * TCP Westwood+
3 *
4 * Angelo Dell'Aera: TCP Westwood+ support
5 */
6
7#include <linux/config.h>
8#include <linux/mm.h>
9#include <linux/module.h>
10#include <linux/skbuff.h>
11#include <linux/tcp_diag.h>
12#include <net/tcp.h>
13
14/* TCP Westwood structure */
15struct westwood {
16 u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */
17 u32 bw_est; /* bandwidth estimate */
18 u32 rtt_win_sx; /* here starts a new evaluation... */
19 u32 bk;
20 u32 snd_una; /* used for evaluating the number of acked bytes */
21 u32 cumul_ack;
22 u32 accounted;
23 u32 rtt;
24 u32 rtt_min; /* minimum observed RTT */
25};
26
27
28/* TCP Westwood functions and constants */
29#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */
30#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */
31
32/*
33 * @tcp_westwood_init
34 * This function initializes fields used in TCP Westwood+,
35 * it is called after the initial SYN, so the sequence numbers
36 * are correct, but for new passive connections we have no
37 * information about RTTmin at this time, so we simply set it to
38 * TCP_WESTWOOD_INIT_RTT. This value was deliberately chosen to be
39 * overly conservative, so we can be sure it will be updated in a
40 * consistent way as soon as possible, which will reasonably happen
41 * within the first RTT period of the connection lifetime.
42 */
43static void tcp_westwood_init(struct tcp_sock *tp)
44{
45 struct westwood *w = tcp_ca(tp);
46
47 w->bk = 0;
48 w->bw_ns_est = 0;
49 w->bw_est = 0;
50 w->accounted = 0;
51 w->cumul_ack = 0;
52 w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT;
53 w->rtt_win_sx = tcp_time_stamp;
54 w->snd_una = tp->snd_una;
55}
56
57/*
58 * @westwood_do_filter
59 * Low-pass filter. Implemented using constant coefficients.
60 */
61static inline u32 westwood_do_filter(u32 a, u32 b)
62{
63 return (((7 * a) + b) >> 3);
64}
65
66static inline void westwood_filter(struct westwood *w, u32 delta)
67{
68 w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta);
69 w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est);
70}
71
72/*
73 * @westwood_pkts_acked
74 * Called after processing a group of packets,
75 * but all Westwood needs is the last sample of srtt.
76 */
77static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt)
78{
79 struct westwood *w = tcp_ca(tp);
80 if (cnt > 0)
81 w->rtt = tp->srtt >> 3;
82}
83
84/*
85 * @westwood_update_window
86 * Updates the RTT evaluation window if it is the right moment to
87 * do so; if it is, it calls the filter to evaluate bandwidth.
88 */
89static void westwood_update_window(struct tcp_sock *tp)
90{
91 struct westwood *w = tcp_ca(tp);
92 s32 delta = tcp_time_stamp - w->rtt_win_sx;
93
94 /*
95 * See if an RTT-window has passed.
96 * Be careful: if the RTT is less than
97 * 50ms we don't filter but keep 'building the sample',
98 * since bandwidth estimation over such small
99 * time intervals is better avoided.
100 * Obviously on a LAN we reasonably will always have
101 * right_bound = left_bound + WESTWOOD_RTT_MIN
102 */
103 if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) {
104 westwood_filter(w, delta);
105
106 w->bk = 0;
107 w->rtt_win_sx = tcp_time_stamp;
108 }
109}
110
111/*
112 * @westwood_fast_bw
113 * Called when we are in the fast path, in particular when
114 * header prediction is successful. In that case the update is
115 * straightforward and doesn't need any particular care.
116 */
117static inline void westwood_fast_bw(struct tcp_sock *tp)
118{
119 struct westwood *w = tcp_ca(tp);
120
121 westwood_update_window(tp);
122
123 w->bk += tp->snd_una - w->snd_una;
124 w->snd_una = tp->snd_una;
125 w->rtt_min = min(w->rtt, w->rtt_min);
126}
127
128/*
129 * @westwood_acked_count
130 * This function computes cumul_ack, which is used to update bk
131 * in the case of delayed or partial acks.
132 */
133static inline u32 westwood_acked_count(struct tcp_sock *tp)
134{
135 struct westwood *w = tcp_ca(tp);
136
137 w->cumul_ack = tp->snd_una - w->snd_una;
138
139 /* If cumul_ack is 0 this is a dupack since it's not moving
140 * tp->snd_una.
141 */
142 if (!w->cumul_ack) {
143 w->accounted += tp->mss_cache;
144 w->cumul_ack = tp->mss_cache;
145 }
146
147 if (w->cumul_ack > tp->mss_cache) {
148 /* Partial or delayed ack */
149 if (w->accounted >= w->cumul_ack) {
150 w->accounted -= w->cumul_ack;
151 w->cumul_ack = tp->mss_cache;
152 } else {
153 w->cumul_ack -= w->accounted;
154 w->accounted = 0;
155 }
156 }
157
158 w->snd_una = tp->snd_una;
159
160 return w->cumul_ack;
161}
162
163static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp)
164{
165 struct westwood *w = tcp_ca(tp);
166 return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2);
167}
168
169/*
170 * TCP Westwood
171 * Here the limit is evaluated as Bw estimation * RTTmin (converted
172 * to packets using mss_cache). The result is clamped to >= 2,
173 * which avoids ever returning 0.
174 */
175static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp)
176{
177 return westwood_bw_rttmin(tp);
178}
179
180static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event)
181{
182 struct westwood *w = tcp_ca(tp);
183
184 switch(event) {
185 case CA_EVENT_FAST_ACK:
186 westwood_fast_bw(tp);
187 break;
188
189 case CA_EVENT_COMPLETE_CWR:
190 tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp);
191 break;
192
193 case CA_EVENT_FRTO:
194 tp->snd_ssthresh = westwood_bw_rttmin(tp);
195 break;
196
197 case CA_EVENT_SLOW_ACK:
198 westwood_update_window(tp);
199 w->bk += westwood_acked_count(tp);
200 w->rtt_min = min(w->rtt, w->rtt_min);
201 break;
202
203 default:
204 /* don't care */
205 break;
206 }
207}
208
209
210/* Extract TCP socket info for reporting via netlink. */
211static void tcp_westwood_info(struct tcp_sock *tp, u32 ext,
212 struct sk_buff *skb)
213{
214 const struct westwood *ca = tcp_ca(tp);
215 if (ext & (1<<(TCPDIAG_VEGASINFO-1))) {
216 struct rtattr *rta;
217 struct tcpvegas_info *info;
218
219 rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info));
220 info = RTA_DATA(rta);
221 info->tcpv_enabled = 1;
222 info->tcpv_rttcnt = 0;
223 info->tcpv_rtt = jiffies_to_usecs(ca->rtt);
224 info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min);
225 rtattr_failure: ;
226 }
227}
228
229
230static struct tcp_congestion_ops tcp_westwood = {
231 .init = tcp_westwood_init,
232 .ssthresh = tcp_reno_ssthresh,
233 .cong_avoid = tcp_reno_cong_avoid,
234 .min_cwnd = tcp_westwood_cwnd_min,
235 .cwnd_event = tcp_westwood_event,
236 .get_info = tcp_westwood_info,
237 .pkts_acked = tcp_westwood_pkts_acked,
238
239 .owner = THIS_MODULE,
240 .name = "westwood"
241};
242
243static int __init tcp_westwood_register(void)
244{
245 BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE);
246 return tcp_register_congestion_control(&tcp_westwood);
247}
248
249static void __exit tcp_westwood_unregister(void)
250{
251 tcp_unregister_congestion_control(&tcp_westwood);
252}
253
254module_init(tcp_westwood_register);
255module_exit(tcp_westwood_unregister);
256
257MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera");
258MODULE_LICENSE("GPL");
259MODULE_DESCRIPTION("TCP Westwood+");
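
Westwood+'s bandwidth estimate is just the constant-coefficient low-pass filter above, new = (7*old + sample)/8, applied twice: once to the raw bytes-per-window sample bk/delta and once more to smooth the first stage. On CA_EVENT_COMPLETE_CWR the window is then reset to bw_est * rtt_min / mss. A small sketch showing the filter converging on a steady sample (assumed numbers throughout):

    #include <stdio.h>

    /* Same arithmetic as westwood_do_filter(): new = (7*old + b) / 8. */
    static unsigned int lpf(unsigned int a, unsigned int b)
    {
            return (7 * a + b) >> 3;
    }

    int main(void)
    {
            /* Assumed: 64 KB acked per 100 ms window, i.e. 655 bytes/ms. */
            unsigned int sample = 65536 / 100;
            unsigned int bw_ns_est = 0, bw_est = 0;
            int i;

            for (i = 1; i <= 40; i++) {
                    bw_ns_est = lpf(bw_ns_est, sample); /* first stage  */
                    bw_est = lpf(bw_est, bw_ns_est);    /* second stage */
                    if (i % 10 == 0)
                            printf("after %2d windows: bw_est = %u B/ms\n",
                                   i, bw_est);
            }

            /* westwood_bw_rttmin(): cwnd after CWR, assuming an rtt_min
             * of 50 ms and a 1448-byte MSS. */
            unsigned int rtt_min = 50, mss = 1448;
            printf("cwnd after CWR = %u segments\n",
                   bw_est * rtt_min / mss);
            return 0;
    }

Starting the estimate from zero understates bandwidth for the first few windows; that is the same deliberately conservative bias the init function applies to w->rtt via TCP_WESTWOOD_INIT_RTT.
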
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index e66ca9381cfd..95163cd52ae0 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,6 +1,26 @@
1# 1#
2# IPv6 configuration 2# IPv6 configuration
3# 3#
4
5# IPv6 as module will cause a CRASH if you try to unload it
6config IPV6
7 tristate "The IPv6 protocol"
8 default m
9 select CRYPTO if IPV6_PRIVACY
10 select CRYPTO_MD5 if IPV6_PRIVACY
11 ---help---
12 This is complementary support for IP version 6.
13 You will still be able to do traditional IPv4 networking as well.
14
15 For general information about IPv6, see
16 <http://playground.sun.com/pub/ipng/html/ipng-main.html>.
17 For Linux IPv6 development information, see <http://www.linux-ipv6.org>.
18 For specific information about IPv6 under Linux, read the HOWTO at
19 <http://www.bieringer.de/linux/IPv6/>.
20
21 To compile this protocol support as a module, choose M here: the
22 module will be called ipv6.
23
4config IPV6_PRIVACY 24config IPV6_PRIVACY
5 bool "IPv6: Privacy Extensions (RFC 3041) support" 25 bool "IPv6: Privacy Extensions (RFC 3041) support"
6 depends on IPV6 26 depends on IPV6
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a54d4ef3fd35..77004b9456c0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2777,7 +2777,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2777 read_lock_bh(&idev->lock); 2777 read_lock_bh(&idev->lock);
2778 switch (type) { 2778 switch (type) {
2779 case UNICAST_ADDR: 2779 case UNICAST_ADDR:
2780 /* unicast address */ 2780 /* unicast address incl. temp addr */
2781 for (ifa = idev->addr_list; ifa; 2781 for (ifa = idev->addr_list; ifa;
2782 ifa = ifa->if_next, ip_idx++) { 2782 ifa = ifa->if_next, ip_idx++) {
2783 if (ip_idx < s_ip_idx) 2783 if (ip_idx < s_ip_idx)
@@ -2788,19 +2788,6 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
2788 NLM_F_MULTI)) <= 0) 2788 NLM_F_MULTI)) <= 0)
2789 goto done; 2789 goto done;
2790 } 2790 }
2791 /* temp addr */
2792#ifdef CONFIG_IPV6_PRIVACY
2793 for (ifa = idev->tempaddr_list; ifa;
2794 ifa = ifa->tmp_next, ip_idx++) {
2795 if (ip_idx < s_ip_idx)
2796 continue;
2797 if ((err = inet6_fill_ifaddr(skb, ifa,
2798 NETLINK_CB(cb->skb).pid,
2799 cb->nlh->nlmsg_seq, RTM_NEWADDR,
2800 NLM_F_MULTI)) <= 0)
2801 goto done;
2802 }
2803#endif
2804 break; 2791 break;
2805 case MULTICAST_ADDR: 2792 case MULTICAST_ADDR:
2806 /* multicast address */ 2793 /* multicast address */
@@ -2923,6 +2910,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
2923 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); 2910 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags);
2924 r = NLMSG_DATA(nlh); 2911 r = NLMSG_DATA(nlh);
2925 r->ifi_family = AF_INET6; 2912 r->ifi_family = AF_INET6;
2913 r->__ifi_pad = 0;
2926 r->ifi_type = dev->type; 2914 r->ifi_type = dev->type;
2927 r->ifi_index = dev->ifindex; 2915 r->ifi_index = dev->ifindex;
2928 r->ifi_flags = dev_get_flags(dev); 2916 r->ifi_flags = dev_get_flags(dev);
@@ -3030,9 +3018,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
3030 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags); 3018 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
3031 pmsg = NLMSG_DATA(nlh); 3019 pmsg = NLMSG_DATA(nlh);
3032 pmsg->prefix_family = AF_INET6; 3020 pmsg->prefix_family = AF_INET6;
3021 pmsg->prefix_pad1 = 0;
3022 pmsg->prefix_pad2 = 0;
3033 pmsg->prefix_ifindex = idev->dev->ifindex; 3023 pmsg->prefix_ifindex = idev->dev->ifindex;
3034 pmsg->prefix_len = pinfo->prefix_len; 3024 pmsg->prefix_len = pinfo->prefix_len;
3035 pmsg->prefix_type = pinfo->type; 3025 pmsg->prefix_type = pinfo->type;
3026 pmsg->prefix_pad3 = 0;
3036 3027
3037 pmsg->prefix_flags = 0; 3028 pmsg->prefix_flags = 0;
3038 if (pinfo->onlink) 3029 if (pinfo->onlink)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 2b193e3df49a..28d9bcab0970 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -774,7 +774,6 @@ static int __init inet6_init(void)
774 if (if6_proc_init()) 774 if (if6_proc_init())
775 goto proc_if6_fail; 775 goto proc_if6_fail;
776#endif 776#endif
777 ipv6_packet_init();
778 ip6_route_init(); 777 ip6_route_init();
779 ip6_flowlabel_init(); 778 ip6_flowlabel_init();
780 err = addrconf_init(); 779 err = addrconf_init();
@@ -791,6 +790,8 @@ static int __init inet6_init(void)
791 /* Init v6 transport protocols. */ 790 /* Init v6 transport protocols. */
792 udpv6_init(); 791 udpv6_init();
793 tcpv6_init(); 792 tcpv6_init();
793
794 ipv6_packet_init();
794 err = 0; 795 err = 0;
795out: 796out:
796 return err; 797 return err;
@@ -798,7 +799,6 @@ out:
798addrconf_fail: 799addrconf_fail:
799 ip6_flowlabel_cleanup(); 800 ip6_flowlabel_cleanup();
800 ip6_route_cleanup(); 801 ip6_route_cleanup();
801 ipv6_packet_cleanup();
802#ifdef CONFIG_PROC_FS 802#ifdef CONFIG_PROC_FS
803 if6_proc_exit(); 803 if6_proc_exit();
804proc_if6_fail: 804proc_if6_fail:
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499debb..b6c73da5ff35 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
244 opt_space->opt_nflen = 0; 244 opt_space->opt_nflen = 0;
245 } 245 }
246 opt_space->dst1opt = fopt->dst1opt; 246 opt_space->dst1opt = fopt->dst1opt;
247 opt_space->auth = fopt->auth;
248 opt_space->opt_flen = fopt->opt_flen; 247 opt_space->opt_flen = fopt->opt_flen;
249 return opt_space; 248 return opt_space;
250} 249}
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
465 to->pkt_type = from->pkt_type; 465 to->pkt_type = from->pkt_type;
466 to->priority = from->priority; 466 to->priority = from->priority;
467 to->protocol = from->protocol; 467 to->protocol = from->protocol;
468 to->security = from->security;
469 dst_release(to->dst); 468 dst_release(to->dst);
470 to->dst = dst_clone(from->dst); 469 to->dst = dst_clone(from->dst);
471 to->dev = from->dev; 470 to->dev = from->dev;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 562fcd14fdea..29fed6e58d0a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -281,7 +281,7 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr)
281 } 281 }
282 write_unlock_bh(&ipv6_sk_mc_lock); 282 write_unlock_bh(&ipv6_sk_mc_lock);
283 283
284 return -ENOENT; 284 return -EADDRNOTAVAIL;
285} 285}
286 286
287static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) 287static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex)
@@ -386,12 +386,16 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
386 if (ipv6_addr_equal(&pmc->addr, group)) 386 if (ipv6_addr_equal(&pmc->addr, group))
387 break; 387 break;
388 } 388 }
389 if (!pmc) /* must have a prior join */ 389 if (!pmc) { /* must have a prior join */
390 err = -EINVAL;
390 goto done; 391 goto done;
392 }
391 /* if a source filter was set, must be the same mode as before */ 393 /* if a source filter was set, must be the same mode as before */
392 if (pmc->sflist) { 394 if (pmc->sflist) {
393 if (pmc->sfmode != omode) 395 if (pmc->sfmode != omode) {
396 err = -EINVAL;
394 goto done; 397 goto done;
398 }
395 } else if (pmc->sfmode != omode) { 399 } else if (pmc->sfmode != omode) {
396 /* allow mode switches for empty-set filters */ 400 /* allow mode switches for empty-set filters */
397 ip6_mc_add_src(idev, group, omode, 0, NULL, 0); 401 ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
@@ -402,7 +406,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
402 psl = pmc->sflist; 406 psl = pmc->sflist;
403 if (!add) { 407 if (!add) {
404 if (!psl) 408 if (!psl)
405 goto done; 409 goto done; /* err = -EADDRNOTAVAIL */
406 rv = !0; 410 rv = !0;
407 for (i=0; i<psl->sl_count; i++) { 411 for (i=0; i<psl->sl_count; i++) {
408 rv = memcmp(&psl->sl_addr[i], source, 412 rv = memcmp(&psl->sl_addr[i], source,
@@ -411,7 +415,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
411 break; 415 break;
412 } 416 }
413 if (rv) /* source not found */ 417 if (rv) /* source not found */
414 goto done; 418 goto done; /* err = -EADDRNOTAVAIL */
415 419
416 /* special case - (INCLUDE, empty) == LEAVE_GROUP */ 420 /* special case - (INCLUDE, empty) == LEAVE_GROUP */
417 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { 421 if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
@@ -488,6 +492,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
488 struct inet6_dev *idev; 492 struct inet6_dev *idev;
489 struct ipv6_pinfo *inet6 = inet6_sk(sk); 493 struct ipv6_pinfo *inet6 = inet6_sk(sk);
490 struct ip6_sf_socklist *newpsl, *psl; 494 struct ip6_sf_socklist *newpsl, *psl;
495 int leavegroup = 0;
491 int i, err; 496 int i, err;
492 497
493 group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; 498 group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
@@ -503,7 +508,12 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
503 if (!idev) 508 if (!idev)
504 return -ENODEV; 509 return -ENODEV;
505 dev = idev->dev; 510 dev = idev->dev;
506 err = -EADDRNOTAVAIL; 511
512 err = 0;
513 if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
514 leavegroup = 1;
515 goto done;
516 }
507 517
508 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { 518 for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) {
509 if (pmc->ifindex != gsf->gf_interface) 519 if (pmc->ifindex != gsf->gf_interface)
@@ -511,8 +521,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
511 if (ipv6_addr_equal(&pmc->addr, group)) 521 if (ipv6_addr_equal(&pmc->addr, group))
512 break; 522 break;
513 } 523 }
514 if (!pmc) /* must have a prior join */ 524 if (!pmc) { /* must have a prior join */
525 err = -EINVAL;
515 goto done; 526 goto done;
527 }
516 if (gsf->gf_numsrc) { 528 if (gsf->gf_numsrc) {
517 newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, 529 newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk,
518 IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); 530 IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC);
@@ -544,10 +556,13 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
544 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); 556 (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
545 pmc->sflist = newpsl; 557 pmc->sflist = newpsl;
546 pmc->sfmode = gsf->gf_fmode; 558 pmc->sfmode = gsf->gf_fmode;
559 err = 0;
547done: 560done:
548 read_unlock_bh(&idev->lock); 561 read_unlock_bh(&idev->lock);
549 in6_dev_put(idev); 562 in6_dev_put(idev);
550 dev_put(dev); 563 dev_put(dev);
564 if (leavegroup)
565 err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
551 return err; 566 return err;
552} 567}
553 568
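
With this change, a setsockopt(MCAST_MSFILTER) that installs an empty INCLUDE filter on a previously joined group now behaves as a leave, mirroring the IPv4 semantics. From user space that corresponds to something like the following sketch (the interface index and group address are caller-supplied placeholders):

    #define _GNU_SOURCE  /* struct group_filter on glibc */
    #include <string.h>
    #include <sys/socket.h>
    #include <netinet/in.h>

    /* Illustrative: an INCLUDE filter with zero sources means "receive
     * from nobody", which the kernel now treats as leaving the group. */
    static int leave_via_empty_filter(int sock, unsigned int ifindex,
                                      const struct sockaddr_in6 *group)
    {
            struct group_filter gf;

            memset(&gf, 0, sizeof(gf));
            gf.gf_interface = ifindex;
            gf.gf_fmode = MCAST_INCLUDE; /* include...           */
            gf.gf_numsrc = 0;            /* ...no sources at all */
            memcpy(&gf.gf_group, group, sizeof(*group));

            return setsockopt(sock, IPPROTO_IPV6, MCAST_MSFILTER,
                              &gf, GROUP_FILTER_SIZE(0));
    }

The remaining mcast.c hunks tighten the error codes on the same paths: a filter change without a prior join now returns -EINVAL, while a missing source keeps returning -EADDRNOTAVAIL.
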
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2414937f2a83..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,14 +2018,14 @@ static int tcp_v6_init_sock(struct sock *sk)
2018 */ 2018 */
2019 tp->snd_ssthresh = 0x7fffffff; 2019 tp->snd_ssthresh = 0x7fffffff;
2020 tp->snd_cwnd_clamp = ~0; 2020 tp->snd_cwnd_clamp = ~0;
2021 tp->mss_cache_std = tp->mss_cache = 536; 2021 tp->mss_cache = 536;
2022 2022
2023 tp->reordering = sysctl_tcp_reordering; 2023 tp->reordering = sysctl_tcp_reordering;
2024 2024
2025 sk->sk_state = TCP_CLOSE; 2025 sk->sk_state = TCP_CLOSE;
2026 2026
2027 tp->af_specific = &ipv6_specific; 2027 tp->af_specific = &ipv6_specific;
2028 2028 tp->ca_ops = &tcp_init_congestion_ops;
2029 sk->sk_write_space = sk_stream_write_space; 2029 sk->sk_write_space = sk_stream_write_space;
2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); 2030 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2031 2031
diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig
index a16237c0e783..980a826f5d02 100644
--- a/net/ipx/Kconfig
+++ b/net/ipx/Kconfig
@@ -1,6 +1,39 @@
1# 1#
2# IPX configuration 2# IPX configuration
3# 3#
4config IPX
5 tristate "The IPX protocol"
6 select LLC
7 ---help---
8 This is support for the Novell networking protocol, IPX, commonly
9 used for local networks of Windows machines. You need it if you
10 want to access Novell NetWare file or print servers using the Linux
11 Novell client ncpfs (available from
12 <ftp://platan.vc.cvut.cz/pub/linux/ncpfs/>) or from
13 within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO,
14 available from <http://www.tldp.org/docs.html#howto>). In order
15 to do the former, you'll also have to say Y to "NCP file system
16 support", below.
17
18 IPX is similar in scope to IP, while SPX, which runs on top of IPX,
19 is similar to TCP. There is also experimental support for SPX in
20 Linux (see "SPX networking", below).
21
22 To turn your Linux box into a fully featured NetWare file server and
23 IPX router, say Y here and fetch either lwared from
24 <ftp://ibiblio.org/pub/Linux/system/network/daemons/> or
25 mars_nwe from <ftp://www.compu-art.de/mars_nwe/>. For more
26 information, read the IPX-HOWTO available from
27 <http://www.tldp.org/docs.html#howto>.
28
29 General information about how to connect Linux, Windows machines and
30 Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
31
32 The IPX driver would enlarge your kernel by about 16 KB. To compile
33 this driver as a module, choose M here: the module will be called ipx.
34 Unless you want to integrate your Linux box with a local Novell
35 network, say N.
36
4config IPX_INTERN 37config IPX_INTERN
5 bool "IPX: Full internal IPX network" 38 bool "IPX: Full internal IPX network"
6 depends on IPX 39 depends on IPX
diff --git a/net/irda/irlap.c b/net/irda/irlap.c
index 046ad0750e48..7029618f5719 100644
--- a/net/irda/irlap.c
+++ b/net/irda/irlap.c
@@ -445,9 +445,8 @@ void irlap_disconnect_request(struct irlap_cb *self)
445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;); 445 IRDA_ASSERT(self->magic == LAP_MAGIC, return;);
446 446
447 /* Don't disconnect until all data frames are successfully sent */ 447 /* Don't disconnect until all data frames are successfully sent */
448 if (skb_queue_len(&self->txq) > 0) { 448 if (!skb_queue_empty(&self->txq)) {
449 self->disconnect_pending = TRUE; 449 self->disconnect_pending = TRUE;
450
451 return; 450 return;
452 } 451 }
453 452
diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c
index 1cd89f5f3b75..a505b5457608 100644
--- a/net/irda/irlap_event.c
+++ b/net/irda/irlap_event.c
@@ -191,7 +191,7 @@ static void irlap_start_poll_timer(struct irlap_cb *self, int timeout)
191 * Send out the RR frames faster if our own transmit queue is empty, or 191 * Send out the RR frames faster if our own transmit queue is empty, or
192 * if the peer is busy. The effect is a much faster conversation 192 * if the peer is busy. The effect is a much faster conversation
193 */ 193 */
194 if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { 194 if (skb_queue_empty(&self->txq) || self->remote_busy) {
195 if (self->fast_RR == TRUE) { 195 if (self->fast_RR == TRUE) {
196 /* 196 /*
197 * Assert that the fast poll timer has not reached the 197 * Assert that the fast poll timer has not reached the
@@ -263,7 +263,7 @@ void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event,
263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, 263 IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__,
264 skb_queue_len(&self->txq)); 264 skb_queue_len(&self->txq));
265 265
266 if (skb_queue_len(&self->txq)) { 266 if (!skb_queue_empty(&self->txq)) {
267 /* Prevent race conditions with irlap_data_request() */ 267 /* Prevent race conditions with irlap_data_request() */
268 self->local_busy = TRUE; 268 self->local_busy = TRUE;
269 269
@@ -1074,7 +1074,7 @@ static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event,
1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1074#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1075 /* Window has been adjusted for the max packet 1075 /* Window has been adjusted for the max packet
1076 * size, so much simpler... - Jean II */ 1076 * size, so much simpler... - Jean II */
1077 nextfit = (skb_queue_len(&self->txq) > 0); 1077 nextfit = !skb_queue_empty(&self->txq);
1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1078#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1079 /* 1079 /*
1080 * Send data with poll bit cleared only if window > 1 1080 * Send data with poll bit cleared only if window > 1
@@ -1814,7 +1814,7 @@ static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event,
1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1814#else /* CONFIG_IRDA_DYNAMIC_WINDOW */
1815 /* Window has been adjusted for the max packet 1815 /* Window has been adjusted for the max packet
1816 * size, so much simpler... - Jean II */ 1816 * size, so much simpler... - Jean II */
1817 nextfit = (skb_queue_len(&self->txq) > 0); 1817 nextfit = !skb_queue_empty(&self->txq);
1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ 1818#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */
1819 /* 1819 /*
1820 * Send data with final bit cleared only if window > 1 1820 * Send data with final bit cleared only if window > 1
@@ -1937,7 +1937,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
1937 irlap_data_indication(self, skb, FALSE); 1937 irlap_data_indication(self, skb, FALSE);
1938 1938
1939 /* Any pending data requests? */ 1939 /* Any pending data requests? */
1940 if ((skb_queue_len(&self->txq) > 0) && 1940 if (!skb_queue_empty(&self->txq) &&
1941 (self->window > 0)) 1941 (self->window > 0))
1942 { 1942 {
1943 self->ack_required = TRUE; 1943 self->ack_required = TRUE;
@@ -2038,7 +2038,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2038 /* 2038 /*
2039 * Any pending data requests? 2039 * Any pending data requests?
2040 */ 2040 */
2041 if ((skb_queue_len(&self->txq) > 0) && 2041 if (!skb_queue_empty(&self->txq) &&
2042 (self->window > 0) && !self->remote_busy) 2042 (self->window > 0) && !self->remote_busy)
2043 { 2043 {
2044 irlap_data_indication(self, skb, TRUE); 2044 irlap_data_indication(self, skb, TRUE);
@@ -2069,7 +2069,7 @@ static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event,
2069 */ 2069 */
2070 nr_status = irlap_validate_nr_received(self, info->nr); 2070 nr_status = irlap_validate_nr_received(self, info->nr);
2071 if (nr_status == NR_EXPECTED) { 2071 if (nr_status == NR_EXPECTED) {
2072 if ((skb_queue_len( &self->txq) > 0) && 2072 if (!skb_queue_empty(&self->txq) &&
2073 (self->window > 0)) { 2073 (self->window > 0)) {
2074 self->remote_busy = FALSE; 2074 self->remote_busy = FALSE;
2075 2075
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 040abe714aa3..6dafbb43b529 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -1018,11 +1018,10 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1018 /* 1018 /*
1019 * We can now fill the window with additional data frames 1019 * We can now fill the window with additional data frames
1020 */ 1020 */
1021 while (skb_queue_len( &self->txq) > 0) { 1021 while (!skb_queue_empty(&self->txq)) {
1022 1022
1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); 1023 IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__);
1024 if ((skb_queue_len( &self->txq) > 0) && 1024 if (self->window > 0) {
1025 (self->window > 0)) {
1026 skb = skb_dequeue( &self->txq); 1025 skb = skb_dequeue( &self->txq);
1027 IRDA_ASSERT(skb != NULL, return;); 1026 IRDA_ASSERT(skb != NULL, return;);
1028 1027
@@ -1031,8 +1030,7 @@ void irlap_resend_rejected_frames(struct irlap_cb *self, int command)
1031 * bit cleared 1030 * bit cleared
1032 */ 1031 */
1033 if ((self->window > 1) && 1032 if ((self->window > 1) &&
1034 skb_queue_len(&self->txq) > 0) 1033 !skb_queue_empty(&self->txq)) {
1035 {
1036 irlap_send_data_primary(self, skb); 1034 irlap_send_data_primary(self, skb);
1037 } else { 1035 } else {
1038 irlap_send_data_primary_poll(self, skb); 1036 irlap_send_data_primary_poll(self, skb);
diff --git a/net/irda/irttp.c b/net/irda/irttp.c
index d091ccf773b3..6602d901f8b1 100644
--- a/net/irda/irttp.c
+++ b/net/irda/irttp.c
@@ -1513,7 +1513,7 @@ int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata,
1513 /* 1513 /*
1514 * Check if there is still data segments in the transmit queue 1514 * Check if there is still data segments in the transmit queue
1515 */ 1515 */
1516 if (skb_queue_len(&self->tx_queue) > 0) { 1516 if (!skb_queue_empty(&self->tx_queue)) {
1517 if (priority == P_HIGH) { 1517 if (priority == P_HIGH) {
1518 /* 1518 /*
1519 * No need to send the queued data, if we are 1519 * No need to send the queued data, if we are
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
new file mode 100644
index 000000000000..f0b5efb31a00
--- /dev/null
+++ b/net/lapb/Kconfig
@@ -0,0 +1,22 @@
1#
2# LAPB Data Link Driver
3#
4
5config LAPB
6 tristate "LAPB Data Link Driver (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
10 the lower) part of the X.25 protocol. It offers a reliable
11 connection service to exchange data frames with one other host, and
12 it is used to transport higher level protocols (mostly X.25 Packet
13 Layer, the higher part of X.25, but others are possible as well).
14 Usually, LAPB is used with specialized X.21 network cards, but Linux
15 currently supports LAPB only over Ethernet connections. If you want
16 to use LAPB connections over Ethernet, say Y here and to "LAPB over
17 Ethernet driver" below. Read
18 <file:Documentation/networking/lapb-module.txt> for technical
19 details.
20
21 To compile this driver as a module, choose M here: the
22 module will be called lapb. If unsure, say N.
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index cd130c3b72bc..d5bdb53a348f 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -84,7 +84,7 @@ static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
84 if (llc->dev->flags & IFF_LOOPBACK) 84 if (llc->dev->flags & IFF_LOOPBACK)
85 goto out; 85 goto out;
86 rc = 1; 86 rc = 1;
87 if (!skb_queue_len(&llc->pdu_unack_q)) 87 if (skb_queue_empty(&llc->pdu_unack_q))
88 goto out; 88 goto out;
89 skb = skb_peek(&llc->pdu_unack_q); 89 skb = skb_peek(&llc->pdu_unack_q);
90 pdu = llc_pdu_sn_hdr(skb); 90 pdu = llc_pdu_sn_hdr(skb);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 70bcd4744d93..3405fdf41b93 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -315,8 +315,8 @@ err:
315static void netlink_remove(struct sock *sk) 315static void netlink_remove(struct sock *sk)
316{ 316{
317 netlink_table_grab(); 317 netlink_table_grab();
318 nl_table[sk->sk_protocol].hash.entries--; 318 if (sk_del_node_init(sk))
319 sk_del_node_init(sk); 319 nl_table[sk->sk_protocol].hash.entries--;
320 if (nlk_sk(sk)->groups) 320 if (nlk_sk(sk)->groups)
321 __sk_del_bind_node(sk); 321 __sk_del_bind_node(sk);
322 netlink_table_ungrab(); 322 netlink_table_ungrab();
@@ -429,7 +429,12 @@ retry:
429 err = netlink_insert(sk, pid); 429 err = netlink_insert(sk, pid);
430 if (err == -EADDRINUSE) 430 if (err == -EADDRINUSE)
431 goto retry; 431 goto retry;
432 return 0; 432
433 /* If 2 threads race to autobind, that is fine. */
434 if (err == -EBUSY)
435 err = 0;
436
437 return err;
433} 438}
434 439
435static inline int netlink_capable(struct socket *sock, unsigned int flag) 440static inline int netlink_capable(struct socket *sock, unsigned int flag)
@@ -853,7 +858,7 @@ static inline void netlink_rcv_wake(struct sock *sk)
853{ 858{
854 struct netlink_sock *nlk = nlk_sk(sk); 859 struct netlink_sock *nlk = nlk_sk(sk);
855 860
856 if (!skb_queue_len(&sk->sk_receive_queue)) 861 if (skb_queue_empty(&sk->sk_receive_queue))
857 clear_bit(0, &nlk->state); 862 clear_bit(0, &nlk->state);
858 if (!test_bit(0, &nlk->state)) 863 if (!test_bit(0, &nlk->state))
859 wake_up_interruptible(&nlk->wait); 864 wake_up_interruptible(&nlk->wait);
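
The autobind change above exists because two threads can race to autobind the same socket: the loser finds the pid slot already occupied by its own socket and gets -EBUSY back from netlink_insert(), which is a success as far as the caller is concerned. A condensed standalone sketch of that control flow, where insert() is a stand-in that fakes one outcome per attempt:

    #include <stdio.h>
    #include <errno.h>

    /* Stand-in for netlink_insert(): first pid is taken by another
     * socket, then we lose an autobind race, then we would succeed. */
    static int insert(int attempt)
    {
            switch (attempt) {
            case 0:  return -EADDRINUSE; /* pid in use: retry another */
            case 1:  return -EBUSY;      /* raced: already bound      */
            default: return 0;
            }
    }

    static int autobind(void)
    {
            int attempt = 0, err;
    retry:
            err = insert(attempt++);
            if (err == -EADDRINUSE)
                    goto retry;

            /* If 2 threads race to autobind, that is fine: either way
             * the socket ended up bound, so -EBUSY is not an error. */
            if (err == -EBUSY)
                    err = 0;
            return err;
    }

    int main(void)
    {
            printf("autobind -> %d\n", autobind());
            return 0;
    }
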
diff --git a/net/packet/Kconfig b/net/packet/Kconfig
new file mode 100644
index 000000000000..34ff93ff894d
--- /dev/null
+++ b/net/packet/Kconfig
@@ -0,0 +1,26 @@
1#
2# Packet configuration
3#
4
5config PACKET
6 tristate "Packet socket"
7 ---help---
8 The Packet protocol is used by applications which communicate
9 directly with network devices without an intermediate network
10 protocol implemented in the kernel, e.g. tcpdump. If you want them
11 to work, choose Y.
12
13 To compile this driver as a module, choose M here: the module will
14 be called af_packet.
15
16 If unsure, say Y.
17
18config PACKET_MMAP
19 bool "Packet socket: mmapped IO"
20 depends on PACKET
21 help
22 If you say Y here, the Packet protocol driver will use an IO
23 mechanism that results in faster communication.
24
25 If unsure, say N.
26
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 0269616e75a1..c9d5980aa4de 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct
274 dst_release(skb->dst); 274 dst_release(skb->dst);
275 skb->dst = NULL; 275 skb->dst = NULL;
276 276
277 /* drop conntrack reference */
278 nf_reset(skb);
279
277 spkt = (struct sockaddr_pkt*)skb->cb; 280 spkt = (struct sockaddr_pkt*)skb->cb;
278 281
279 skb_push(skb, skb->data-skb->mac.raw); 282 skb_push(skb, skb->data-skb->mac.raw);
@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
517 dst_release(skb->dst); 520 dst_release(skb->dst);
518 skb->dst = NULL; 521 skb->dst = NULL;
519 522
523 /* drop conntrack reference */
524 nf_reset(skb);
525
520 spin_lock(&sk->sk_receive_queue.lock); 526 spin_lock(&sk->sk_receive_queue.lock);
521 po->stats.tp_packets++; 527 po->stats.tp_packets++;
522 __skb_queue_tail(&sk->sk_receive_queue, skb); 528 __skb_queue_tail(&sk->sk_receive_queue, skb);
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
138 138
139 _debug("### End Work"); 139 _debug("### End Work");
140 140
141 try_to_freeze(PF_FREEZE); 141 try_to_freeze();
142 142
143 /* discard pending signals */ 143 /* discard pending signals */
144 rxrpc_discard_my_signals(); 144 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
107 107
108 _debug("### End Inbound Calls"); 108 _debug("### End Inbound Calls");
109 109
110 try_to_freeze(PF_FREEZE); 110 try_to_freeze();
111 111
112 /* discard pending signals */ 112 /* discard pending signals */
113 rxrpc_discard_my_signals(); 113 rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
90 complete_and_exit(&krxtimod_dead, 0); 90 complete_and_exit(&krxtimod_dead, 0);
91 } 91 }
92 92
93 try_to_freeze(PF_FREEZE); 93 try_to_freeze();
94 94
95 /* discard pending signals */ 95 /* discard pending signals */
96 rxrpc_discard_my_signals(); 96 rxrpc_discard_my_signals();
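
All three rxrpc daemons receive the same mechanical change: try_to_freeze() lost its PF_FREEZE argument and now inspects the current task itself. The loop shape these threads share, sketched with illustrative names (the real daemons predate the kthread API, so this is only the general pattern):

        static int daemon_sketch(void *arg)
        {
                for (;;) {
                        /* ... perform one batch of queued work ... */

                        try_to_freeze();        /* park here across suspend */

                        /* ... discard signals, sleep until woken ... */
                        if (kthread_should_stop())
                                break;
                }
                return 0;
        }
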
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9beb604d..59d3e71f8b85 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,43 @@
1# 1#
2# Traffic control configuration. 2# Traffic control configuration.
3# 3#
4
5menuconfig NET_SCHED
6 bool "QoS and/or fair queueing"
7 ---help---
8 When the kernel has several packets to send out over a network
9 device, it has to decide which ones to send first, which ones to
10 delay, and which ones to drop. This is the job of the packet
11 scheduler, and several different algorithms for how to do this
12 "fairly" have been proposed.
13
14 If you say N here, you will get the standard packet scheduler, which
15 is a FIFO (first come, first served). If you say Y here, you will be
16 able to choose from among several alternative algorithms which can
17 then be attached to different network devices. This is useful for
18 example if some of your network devices are real time devices that
19 need a certain minimum data flow rate, or if you need to limit the
20 maximum data flow rate for traffic which matches specified criteria.
21 This code is considered to be experimental.
22
23 To administer these schedulers, you'll need the user-level utilities
24 from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
25 That package also contains some documentation; for more, check out
26 <http://snafu.freedom.org/linux2.2/iproute-notes.html>.
27
28 This Quality of Service (QoS) support will enable you to use
29 Differentiated Services (diffserv) and Resource Reservation Protocol
30 (RSVP) on your Linux router if you also say Y to "QoS support",
31 "Packet classifier API" and to some classifiers below. Documentation
 32	  and software are at <http://diffserv.sourceforge.net/>.
33
34 If you say Y here and to "/proc file system" below, you will be able
35 to read status information about packet schedulers from the file
36 /proc/net/psched.
37
38 The available schedulers are listed in the following questions; you
39 can say Y to as many as you like. If unsure, say N now.
40
4choice 41choice
5 prompt "Packet scheduler clock source" 42 prompt "Packet scheduler clock source"
6 depends on NET_SCHED 43 depends on NET_SCHED
@@ -449,6 +486,19 @@ config NET_EMATCH_META
449 To compile this code as a module, choose M here: the 486 To compile this code as a module, choose M here: the
450 module will be called em_meta. 487 module will be called em_meta.
451 488
489config NET_EMATCH_TEXT
490 tristate "Textsearch"
491 depends on NET_EMATCH
492 select TEXTSEARCH
493 select TEXTSEARCH_KMP
494 select TEXTSEARCH_FSM
495 ---help---
 496	  Say Y here if you want to be able to classify packets based on
497 textsearch comparisons.
498
499 To compile this code as a module, choose M here: the
500 module will be called em_text.
501
452config NET_CLS_ACT 502config NET_CLS_ACT
453 bool "Packet ACTION" 503 bool "Packet ACTION"
454 depends on EXPERIMENTAL && NET_CLS && NET_QOS 504 depends on EXPERIMENTAL && NET_CLS && NET_QOS
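
The help text above describes the enqueue/dequeue split in prose; in code, a scheduler is just a pair of callbacks. A toy FIFO sketch under the same Qdisc_ops contract used by sch_blackhole.c later in this patch (the 100-packet limit is arbitrary):

        static int toyfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
        {
                if (skb_queue_len(&sch->q) < 100) {     /* delay: queue it */
                        __skb_queue_tail(&sch->q, skb);
                        return NET_XMIT_SUCCESS;
                }
                qdisc_drop(skb, sch);                   /* or drop it */
                return NET_XMIT_DROP;
        }

        static struct sk_buff *toyfifo_dequeue(struct Qdisc *sch)
        {
                return __skb_dequeue(&sch->q);          /* send first in line */
        }
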
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..e48d0d456b3e 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -4,7 +4,7 @@
4 4
5obj-y := sch_generic.o 5obj-y := sch_generic.o
6 6
7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o 7obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o sch_blackhole.o
8obj-$(CONFIG_NET_CLS) += cls_api.o 8obj-$(CONFIG_NET_CLS) += cls_api.o
9obj-$(CONFIG_NET_CLS_ACT) += act_api.o 9obj-$(CONFIG_NET_CLS_ACT) += act_api.o
10obj-$(CONFIG_NET_ACT_POLICE) += police.o 10obj-$(CONFIG_NET_ACT_POLICE) += police.o
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o 40obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o 41obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o 42obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
43obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9594206e6035..249c61936ea0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -439,6 +439,8 @@ tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq,
439 439
440 t = NLMSG_DATA(nlh); 440 t = NLMSG_DATA(nlh);
441 t->tca_family = AF_UNSPEC; 441 t->tca_family = AF_UNSPEC;
442 t->tca__pad1 = 0;
443 t->tca__pad2 = 0;
442 444
443 x = (struct rtattr*) skb->tail; 445 x = (struct rtattr*) skb->tail;
444 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 446 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -580,6 +582,8 @@ static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid)
580 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); 582 nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t));
581 t = NLMSG_DATA(nlh); 583 t = NLMSG_DATA(nlh);
582 t->tca_family = AF_UNSPEC; 584 t->tca_family = AF_UNSPEC;
585 t->tca__pad1 = 0;
586 t->tca__pad2 = 0;
583 587
584 x = (struct rtattr *) skb->tail; 588 x = (struct rtattr *) skb->tail;
585 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 589 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
@@ -687,7 +691,9 @@ static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event,
687 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags); 691 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*t), flags);
688 t = NLMSG_DATA(nlh); 692 t = NLMSG_DATA(nlh);
689 t->tca_family = AF_UNSPEC; 693 t->tca_family = AF_UNSPEC;
690 694 t->tca__pad1 = 0;
695 t->tca__pad2 = 0;
696
691 x = (struct rtattr*) skb->tail; 697 x = (struct rtattr*) skb->tail;
692 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 698 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
693 699
@@ -842,6 +848,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
842 cb->nlh->nlmsg_type, sizeof(*t)); 848 cb->nlh->nlmsg_type, sizeof(*t));
843 t = NLMSG_DATA(nlh); 849 t = NLMSG_DATA(nlh);
844 t->tca_family = AF_UNSPEC; 850 t->tca_family = AF_UNSPEC;
851 t->tca__pad1 = 0;
852 t->tca__pad2 = 0;
845 853
846 x = (struct rtattr *) skb->tail; 854 x = (struct rtattr *) skb->tail;
847 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); 855 RTA_PUT(skb, TCA_ACT_TAB, 0, NULL);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1616bf5c9627..3b5714ef4d1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -331,6 +331,8 @@ tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh,
331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 331 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
332 tcm = NLMSG_DATA(nlh); 332 tcm = NLMSG_DATA(nlh);
333 tcm->tcm_family = AF_UNSPEC; 333 tcm->tcm_family = AF_UNSPEC;
334 tcm->tcm__pad1 = 0;
 335 tcm->tcm__pad2 = 0;
334 tcm->tcm_ifindex = tp->q->dev->ifindex; 336 tcm->tcm_ifindex = tp->q->dev->ifindex;
335 tcm->tcm_parent = tp->classid; 337 tcm->tcm_parent = tp->classid;
336 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); 338 tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb9196810..006168d69376 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
618 pinfo.protocol = s->protocol; 618 pinfo.protocol = s->protocol;
619 pinfo.tunnelid = s->tunnelid; 619 pinfo.tunnelid = s->tunnelid;
620 pinfo.tunnelhdr = f->tunnelhdr; 620 pinfo.tunnelhdr = f->tunnelhdr;
621 pinfo.pad = 0;
621 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 622 RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
622 if (f->res.classid) 623 if (f->res.classid)
623 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); 624 RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
205 dst->value = skb->protocol; 205 dst->value = skb->protocol;
206} 206}
207 207
208META_COLLECTOR(int_security)
209{
210 dst->value = skb->security;
211}
212
213META_COLLECTOR(int_pkttype) 208META_COLLECTOR(int_pkttype)
214{ 209{
215 dst->value = skb->pkt_type; 210 dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
524 [META_ID(REALDEV)] = META_FUNC(int_realdev), 519 [META_ID(REALDEV)] = META_FUNC(int_realdev),
525 [META_ID(PRIORITY)] = META_FUNC(int_priority), 520 [META_ID(PRIORITY)] = META_FUNC(int_priority),
526 [META_ID(PROTOCOL)] = META_FUNC(int_protocol), 521 [META_ID(PROTOCOL)] = META_FUNC(int_protocol),
527 [META_ID(SECURITY)] = META_FUNC(int_security),
528 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), 522 [META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
529 [META_ID(PKTLEN)] = META_FUNC(int_pktlen), 523 [META_ID(PKTLEN)] = META_FUNC(int_pktlen),
530 [META_ID(DATALEN)] = META_FUNC(int_datalen), 524 [META_ID(DATALEN)] = META_FUNC(int_datalen),
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..873840d8d072
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
1/*
2 * net/sched/em_text.c Textsearch ematch
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 */
11
12#include <linux/config.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/sched.h>
17#include <linux/string.h>
18#include <linux/skbuff.h>
19#include <linux/textsearch.h>
20#include <linux/tc_ematch/tc_em_text.h>
21#include <net/pkt_cls.h>
22
23struct text_match
24{
25 u16 from_offset;
26 u16 to_offset;
27 u8 from_layer;
28 u8 to_layer;
29 struct ts_config *config;
30};
31
32#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
33
34static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
35 struct tcf_pkt_info *info)
36{
37 struct text_match *tm = EM_TEXT_PRIV(m);
38 int from, to;
39 struct ts_state state;
40
41 from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
42 from += tm->from_offset;
43
44 to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
45 to += tm->to_offset;
46
47 return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
48}
49
50static int em_text_change(struct tcf_proto *tp, void *data, int len,
51 struct tcf_ematch *m)
52{
53 struct text_match *tm;
54 struct tcf_em_text *conf = data;
55 struct ts_config *ts_conf;
56 int flags = 0;
57
 58	printk(KERN_DEBUG "Configuring text: %s from %d:%d to %d:%d len %d\n",
 59	       conf->algo, conf->from_offset, conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len);
60
61 if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
62 return -EINVAL;
63
64 if (conf->from_layer > conf->to_layer)
65 return -EINVAL;
66
67 if (conf->from_layer == conf->to_layer &&
68 conf->from_offset > conf->to_offset)
69 return -EINVAL;
70
71retry:
72 ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
73 conf->pattern_len, GFP_KERNEL, flags);
74
75 if (flags & TS_AUTOLOAD)
76 rtnl_lock();
77
78 if (IS_ERR(ts_conf)) {
79 if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
80 rtnl_unlock();
81 flags |= TS_AUTOLOAD;
82 goto retry;
83 } else
84 return PTR_ERR(ts_conf);
85 } else if (flags & TS_AUTOLOAD) {
86 textsearch_destroy(ts_conf);
87 return -EAGAIN;
88 }
89
90 tm = kmalloc(sizeof(*tm), GFP_KERNEL);
91 if (tm == NULL) {
92 textsearch_destroy(ts_conf);
93 return -ENOBUFS;
94 }
95
96 tm->from_offset = conf->from_offset;
97 tm->to_offset = conf->to_offset;
98 tm->from_layer = conf->from_layer;
99 tm->to_layer = conf->to_layer;
100 tm->config = ts_conf;
101
102 m->datalen = sizeof(*tm);
103 m->data = (unsigned long) tm;
104
105 return 0;
106}
107
108static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
109{
110 textsearch_destroy(EM_TEXT_PRIV(m)->config);
111}
112
113static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
114{
115 struct text_match *tm = EM_TEXT_PRIV(m);
116 struct tcf_em_text conf;
117
 118	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo));
119 conf.from_offset = tm->from_offset;
120 conf.to_offset = tm->to_offset;
121 conf.from_layer = tm->from_layer;
122 conf.to_layer = tm->to_layer;
123 conf.pattern_len = textsearch_get_pattern_len(tm->config);
124 conf.pad = 0;
125
126 RTA_PUT_NOHDR(skb, sizeof(conf), &conf);
127 RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));
128 return 0;
129
130rtattr_failure:
131 return -1;
132}
133
134static struct tcf_ematch_ops em_text_ops = {
135 .kind = TCF_EM_TEXT,
136 .change = em_text_change,
137 .match = em_text_match,
138 .destroy = em_text_destroy,
139 .dump = em_text_dump,
140 .owner = THIS_MODULE,
141 .link = LIST_HEAD_INIT(em_text_ops.link)
142};
143
144static int __init init_em_text(void)
145{
146 return tcf_em_register(&em_text_ops);
147}
148
149static void __exit exit_em_text(void)
150{
151 tcf_em_unregister(&em_text_ops);
152}
153
154MODULE_LICENSE("GPL");
155
156module_init(init_em_text);
157module_exit(exit_em_text);
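
em_text builds on the generic lib/textsearch API that the Kconfig entry above selects. A rough usage sketch outside of skb context ("kmp" is one of the selectable algorithms; the function and buffer names are illustrative):

        static int contains_get(const void *buf, unsigned int buf_len)
        {
                struct ts_config *conf;
                struct ts_state st;
                unsigned int pos;

                conf = textsearch_prepare("kmp", "GET ", 4, GFP_KERNEL,
                                          TS_AUTOLOAD);
                if (IS_ERR(conf))
                        return PTR_ERR(conf);

                pos = textsearch_find_continuous(conf, &st, buf, buf_len);
                textsearch_destroy(conf);

                return pos != UINT_MAX;         /* found at offset pos */
        }
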
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 97c1c75d5c78..b9a069af4a02 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -399,10 +399,8 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{ 399{
400 int err; 400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1]; 401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch; 402 struct Qdisc *sch;
404 struct Qdisc_ops *ops; 403 struct Qdisc_ops *ops;
405 int size;
406 404
407 ops = qdisc_lookup_ops(kind); 405 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD 406#ifdef CONFIG_KMOD
@@ -437,64 +435,55 @@ qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
437 if (ops == NULL) 435 if (ops == NULL)
438 goto err_out; 436 goto err_out;
439 437
440 /* ensure that the Qdisc and the private data are 32-byte aligned */ 438 sch = qdisc_alloc(dev, ops);
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 439 if (IS_ERR(sch)) {
442 size += ops->priv_size + QDISC_ALIGN_CONST; 440 err = PTR_ERR(sch);
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2; 441 goto err_out2;
448 memset(p, 0, size); 442 }
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455 443
456 if (handle == TC_H_INGRESS) 444 if (handle == TC_H_INGRESS) {
457 sch->flags |= TCQ_F_INGRESS; 445 sch->flags |= TCQ_F_INGRESS;
458 446 handle = TC_H_MAKE(TC_H_INGRESS, 0);
459 sch->ops = ops; 447 } else if (handle == 0) {
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev); 448 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM; 449 err = -ENOMEM;
469 if (handle == 0) 450 if (handle == 0)
470 goto err_out3; 451 goto err_out3;
471 } 452 }
472 453
473 if (handle == TC_H_INGRESS) 454 sch->handle = handle;
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477 455
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 456 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
457#ifdef CONFIG_NET_ESTIMATOR
458 if (tca[TCA_RATE-1]) {
459 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
460 sch->stats_lock,
461 tca[TCA_RATE-1]);
462 if (err) {
463 /*
464 * Any broken qdiscs that would require
465 * a ops->reset() here? The qdisc was never
466 * in action so it shouldn't be necessary.
467 */
468 if (ops->destroy)
469 ops->destroy(sch);
470 goto err_out3;
471 }
472 }
473#endif
479 qdisc_lock_tree(dev); 474 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list); 475 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev); 476 qdisc_unlock_tree(dev);
482 477
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch; 478 return sch;
489 } 479 }
490err_out3: 480err_out3:
491 dev_put(dev); 481 dev_put(dev);
482 kfree((char *) sch - sch->padded);
492err_out2: 483err_out2:
493 module_put(ops->owner); 484 module_put(ops->owner);
494err_out: 485err_out:
495 *errp = err; 486 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL; 487 return NULL;
499} 488}
500 489
@@ -770,6 +759,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 759 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
771 tcm = NLMSG_DATA(nlh); 760 tcm = NLMSG_DATA(nlh);
772 tcm->tcm_family = AF_UNSPEC; 761 tcm->tcm_family = AF_UNSPEC;
762 tcm->tcm__pad1 = 0;
763 tcm->tcm__pad2 = 0;
773 tcm->tcm_ifindex = q->dev->ifindex; 764 tcm->tcm_ifindex = q->dev->ifindex;
774 tcm->tcm_parent = clid; 765 tcm->tcm_parent = clid;
775 tcm->tcm_handle = q->handle; 766 tcm->tcm_handle = q->handle;
diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c
new file mode 100644
index 000000000000..81f0b8346d17
--- /dev/null
+++ b/net/sched/sch_blackhole.c
@@ -0,0 +1,54 @@
1/*
2 * net/sched/sch_blackhole.c Black hole queue
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Thomas Graf <tgraf@suug.ch>
10 *
11 * Note: Quantum tunneling is not supported.
12 */
13
14#include <linux/config.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/kernel.h>
18#include <linux/netdevice.h>
19#include <linux/skbuff.h>
20#include <net/pkt_sched.h>
21
22static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
23{
24 qdisc_drop(skb, sch);
25 return NET_XMIT_SUCCESS;
26}
27
28static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
29{
30 return NULL;
31}
32
33static struct Qdisc_ops blackhole_qdisc_ops = {
34 .id = "blackhole",
35 .priv_size = 0,
36 .enqueue = blackhole_enqueue,
37 .dequeue = blackhole_dequeue,
38 .owner = THIS_MODULE,
39};
40
41static int __init blackhole_module_init(void)
42{
43 return register_qdisc(&blackhole_qdisc_ops);
44}
45
46static void __exit blackhole_module_exit(void)
47{
48 unregister_qdisc(&blackhole_qdisc_ops);
49}
50
51module_init(blackhole_module_init)
52module_exit(blackhole_module_exit)
53
54MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index d43e3b8cbf6a..09453f997d8c 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1528 1528
1529 opt.strategy = cl->ovl_strategy; 1529 opt.strategy = cl->ovl_strategy;
1530 opt.priority2 = cl->priority2+1; 1530 opt.priority2 = cl->priority2+1;
1531 opt.pad = 0;
1531 opt.penalty = (cl->penalty*1000)/HZ; 1532 opt.penalty = (cl->penalty*1000)/HZ;
1532 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1533 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1533 return skb->len; 1534 return skb->len;
@@ -1563,6 +1564,8 @@ static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1563 1564
1564 if (cl->police) { 1565 if (cl->police) {
1565 opt.police = cl->police; 1566 opt.police = cl->police;
1567 opt.__res1 = 0;
1568 opt.__res2 = 0;
1566 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1569 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1567 } 1570 }
1568 return skb->len; 1571 return skb->len;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7683b34dc6a9..73e218e646ac 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -395,24 +395,23 @@ static struct Qdisc_ops pfifo_fast_ops = {
395 .owner = THIS_MODULE, 395 .owner = THIS_MODULE,
396}; 396};
397 397
398struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) 398struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
399{ 399{
400 void *p; 400 void *p;
401 struct Qdisc *sch; 401 struct Qdisc *sch;
402 int size; 402 unsigned int size;
403 int err = -ENOBUFS;
403 404
404 /* ensure that the Qdisc and the private data are 32-byte aligned */ 405 /* ensure that the Qdisc and the private data are 32-byte aligned */
405 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); 406 size = QDISC_ALIGN(sizeof(*sch));
406 size += ops->priv_size + QDISC_ALIGN_CONST; 407 size += ops->priv_size + (QDISC_ALIGNTO - 1);
407 408
408 p = kmalloc(size, GFP_KERNEL); 409 p = kmalloc(size, GFP_KERNEL);
409 if (!p) 410 if (!p)
410 return NULL; 411 goto errout;
411 memset(p, 0, size); 412 memset(p, 0, size);
412 413 sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
413 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) 414 sch->padded = (char *) sch - (char *) p;
414 & ~QDISC_ALIGN_CONST);
415 sch->padded = (char *)sch - (char *)p;
416 415
417 INIT_LIST_HEAD(&sch->list); 416 INIT_LIST_HEAD(&sch->list);
418 skb_queue_head_init(&sch->q); 417 skb_queue_head_init(&sch->q);
@@ -423,11 +422,24 @@ struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
423 dev_hold(dev); 422 dev_hold(dev);
424 sch->stats_lock = &dev->queue_lock; 423 sch->stats_lock = &dev->queue_lock;
425 atomic_set(&sch->refcnt, 1); 424 atomic_set(&sch->refcnt, 1);
425
426 return sch;
427errout:
 428 return ERR_PTR(err);	/* err is already negative (-ENOBUFS) */
429}
430
431struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops)
432{
433 struct Qdisc *sch;
434
435 sch = qdisc_alloc(dev, ops);
436 if (IS_ERR(sch))
437 goto errout;
438
426 if (!ops->init || ops->init(sch, NULL) == 0) 439 if (!ops->init || ops->init(sch, NULL) == 0)
427 return sch; 440 return sch;
428 441
429 dev_put(dev); 442errout:
430 kfree(p);
431 return NULL; 443 return NULL;
432} 444}
433 445
@@ -591,6 +603,7 @@ EXPORT_SYMBOL(__netdev_watchdog_up);
591EXPORT_SYMBOL(noop_qdisc); 603EXPORT_SYMBOL(noop_qdisc);
592EXPORT_SYMBOL(noop_qdisc_ops); 604EXPORT_SYMBOL(noop_qdisc_ops);
593EXPORT_SYMBOL(qdisc_create_dflt); 605EXPORT_SYMBOL(qdisc_create_dflt);
606EXPORT_SYMBOL(qdisc_alloc);
594EXPORT_SYMBOL(qdisc_destroy); 607EXPORT_SYMBOL(qdisc_destroy);
595EXPORT_SYMBOL(qdisc_reset); 608EXPORT_SYMBOL(qdisc_reset);
596EXPORT_SYMBOL(qdisc_restart); 609EXPORT_SYMBOL(qdisc_restart);
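
qdisc_alloc() above leans on two macros (from include/net/pkt_sched.h, introduced alongside this change) that replace the open-coded QDISC_ALIGN_CONST arithmetic; reproduced here with a worked example, since the allocation size math is easy to misread:

        #define QDISC_ALIGNTO           32
        #define QDISC_ALIGN(len)        (((len) + QDISC_ALIGNTO - 1) & \
                                         ~(QDISC_ALIGNTO - 1))

        /* e.g. QDISC_ALIGN(140) == 160.  Allocating
         *   QDISC_ALIGN(sizeof(*sch)) + priv_size + (QDISC_ALIGNTO - 1)
         * guarantees that rounding the kmalloc() pointer up to the next
         * 32-byte boundary still leaves priv_size bytes for the private
         * area that follows the aligned struct Qdisc. */
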
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 664d0e47374f..7845d045eec4 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -385,7 +385,7 @@ static int red_change(struct Qdisc *sch, struct rtattr *opt)
385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); 385 memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 386
387 q->qcount = -1; 387 q->qcount = -1;
388 if (skb_queue_len(&sch->q) == 0) 388 if (skb_queue_empty(&sch->q))
389 PSCHED_SET_PASTPERFECT(q->qidlestart); 389 PSCHED_SET_PASTPERFECT(q->qidlestart);
390 sch_tree_unlock(sch); 390 sch_tree_unlock(sch);
391 return 0; 391 return 0;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 7ae6aa772dab..5b24ae0650d3 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
71 const struct sctp_endpoint *ep, 71 const struct sctp_endpoint *ep,
72 const struct sock *sk, 72 const struct sock *sk,
73 sctp_scope_t scope, 73 sctp_scope_t scope,
74 int gfp) 74 unsigned int __nocast gfp)
75{ 75{
76 struct sctp_sock *sp; 76 struct sctp_sock *sp;
77 int i; 77 int i;
@@ -203,7 +203,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
203 */ 203 */
204 asoc->addip_serial = asoc->c.initial_tsn; 204 asoc->addip_serial = asoc->c.initial_tsn;
205 205
206 skb_queue_head_init(&asoc->addip_chunks); 206 INIT_LIST_HEAD(&asoc->addip_chunk_list);
207 207
208 /* Make an empty list of remote transport addresses. */ 208 /* Make an empty list of remote transport addresses. */
209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list); 209 INIT_LIST_HEAD(&asoc->peer.transport_addr_list);
@@ -272,7 +272,8 @@ fail_init:
272/* Allocate and initialize a new association */ 272/* Allocate and initialize a new association */
273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, 273struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
274 const struct sock *sk, 274 const struct sock *sk,
275 sctp_scope_t scope, int gfp) 275 sctp_scope_t scope,
276 unsigned int __nocast gfp)
276{ 277{
277 struct sctp_association *asoc; 278 struct sctp_association *asoc;
278 279
@@ -478,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
478/* Add a transport address to an association. */ 479/* Add a transport address to an association. */
479struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, 480struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
480 const union sctp_addr *addr, 481 const union sctp_addr *addr,
481 const int gfp, 482 const unsigned int __nocast gfp,
482 const int peer_state) 483 const int peer_state)
483{ 484{
484 struct sctp_transport *peer; 485 struct sctp_transport *peer;
@@ -1229,7 +1230,8 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
1229/* Build the bind address list for the association based on info from the 1230/* Build the bind address list for the association based on info from the
1230 * local endpoint and the remote peer. 1231 * local endpoint and the remote peer.
1231 */ 1232 */
1232int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) 1233int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
1234 unsigned int __nocast gfp)
1233{ 1235{
1234 sctp_scope_t scope; 1236 sctp_scope_t scope;
1235 int flags; 1237 int flags;
@@ -1251,7 +1253,8 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp)
1251 1253
1252/* Build the association's bind address list from the cookie. */ 1254/* Build the association's bind address list from the cookie. */
1253int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, 1255int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
1254 struct sctp_cookie *cookie, int gfp) 1256 struct sctp_cookie *cookie,
1257 unsigned int __nocast gfp)
1255{ 1258{
1256 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); 1259 int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
1257 int var_size3 = cookie->raw_addr_list_len; 1260 int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f90eadfb60a2..f71549710f2e 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,8 @@
53 53
54/* Forward declarations for internal helpers. */ 54/* Forward declarations for internal helpers. */
55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, 55static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
56 sctp_scope_t scope, int gfp, int flags); 56 sctp_scope_t scope, unsigned int __nocast gfp,
57 int flags);
57static void sctp_bind_addr_clean(struct sctp_bind_addr *); 58static void sctp_bind_addr_clean(struct sctp_bind_addr *);
58 59
59/* First Level Abstractions. */ 60/* First Level Abstractions. */
@@ -63,7 +64,8 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
63 */ 64 */
64int sctp_bind_addr_copy(struct sctp_bind_addr *dest, 65int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
65 const struct sctp_bind_addr *src, 66 const struct sctp_bind_addr *src,
66 sctp_scope_t scope, int gfp, int flags) 67 sctp_scope_t scope, unsigned int __nocast gfp,
68 int flags)
67{ 69{
68 struct sctp_sockaddr_entry *addr; 70 struct sctp_sockaddr_entry *addr;
69 struct list_head *pos; 71 struct list_head *pos;
@@ -144,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
144 146
145/* Add an address to the bind address list in the SCTP_bind_addr structure. */ 147/* Add an address to the bind address list in the SCTP_bind_addr structure. */
146int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, 148int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
147 int gfp) 149 unsigned int __nocast gfp)
148{ 150{
149 struct sctp_sockaddr_entry *addr; 151 struct sctp_sockaddr_entry *addr;
150 152
@@ -197,7 +199,8 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
197 * The second argument is the return value for the length. 199 * The second argument is the return value for the length.
198 */ 200 */
199union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, 201union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
200 int *addrs_len, int gfp) 202 int *addrs_len,
203 unsigned int __nocast gfp)
201{ 204{
202 union sctp_params addrparms; 205 union sctp_params addrparms;
203 union sctp_params retval; 206 union sctp_params retval;
@@ -249,7 +252,7 @@ end_raw:
249 * address parameters). 252 * address parameters).
250 */ 253 */
251int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, 254int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
252 int addrs_len, __u16 port, int gfp) 255 int addrs_len, __u16 port, unsigned int __nocast gfp)
253{ 256{
254 union sctp_addr_param *rawaddr; 257 union sctp_addr_param *rawaddr;
255 struct sctp_paramhdr *param; 258 struct sctp_paramhdr *param;
@@ -347,7 +350,8 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp,
347/* Copy out addresses from the global local address list. */ 350/* Copy out addresses from the global local address list. */
348static int sctp_copy_one_addr(struct sctp_bind_addr *dest, 351static int sctp_copy_one_addr(struct sctp_bind_addr *dest,
349 union sctp_addr *addr, 352 union sctp_addr *addr,
350 sctp_scope_t scope, int gfp, int flags) 353 sctp_scope_t scope, unsigned int __nocast gfp,
354 int flags)
351{ 355{
352 int error = 0; 356 int error = 0;
353 357
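
The long run of SCTP prototype changes swaps plain `int gfp` for `unsigned int __nocast gfp`. The annotation is a sparse attribute, a no-op for gcc per linux/compiler.h, that lets the checker flag implicit conversions on allocation-flag parameters; it is the stepping stone to the later dedicated gfp_t type:

        #ifdef __CHECKER__
        # define __nocast       __attribute__((nocast))
        #else
        # define __nocast
        #endif
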
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 0c2ab7885058..61da2937e641 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
62} 62}
63 63
64/* Allocate and initialize datamsg. */ 64/* Allocate and initialize datamsg. */
65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) 65SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
66{ 66{
67 struct sctp_datamsg *msg; 67 struct sctp_datamsg *msg;
68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp); 68 msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2ec0320fac3b..e47ac0d1a6d6 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -67,7 +67,8 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
67 * Initialize the base fields of the endpoint structure. 67 * Initialize the base fields of the endpoint structure.
68 */ 68 */
69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, 69static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
70 struct sock *sk, int gfp) 70 struct sock *sk,
71 unsigned int __nocast gfp)
71{ 72{
72 struct sctp_sock *sp = sctp_sk(sk); 73 struct sctp_sock *sp = sctp_sk(sk);
73 memset(ep, 0, sizeof(struct sctp_endpoint)); 74 memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -102,9 +103,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
102 /* Set up the base timeout information. */ 103 /* Set up the base timeout information. */
103 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; 104 ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
104 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = 105 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
105 SCTP_DEFAULT_TIMEOUT_T1_COOKIE; 106 msecs_to_jiffies(sp->rtoinfo.srto_initial);
106 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = 107 ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
107 SCTP_DEFAULT_TIMEOUT_T1_INIT; 108 msecs_to_jiffies(sp->rtoinfo.srto_initial);
108 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = 109 ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
109 msecs_to_jiffies(sp->rtoinfo.srto_initial); 110 msecs_to_jiffies(sp->rtoinfo.srto_initial);
110 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; 111 ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +118,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
117 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] 118 ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
118 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); 119 = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
119 120
120 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 121 ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
121 SCTP_DEFAULT_TIMEOUT_HEARTBEAT; 122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
122 ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = 123 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
123 SCTP_DEFAULT_TIMEOUT_SACK;
124 ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
125 sp->autoclose * HZ;
126 124
127 /* Use SCTP specific send buffer space queues. */ 125 /* Use SCTP specific send buffer space queues. */
128 ep->sndbuf_policy = sctp_sndbuf_policy; 126 ep->sndbuf_policy = sctp_sndbuf_policy;
@@ -140,7 +138,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
140/* Create a sctp_endpoint with all that boring stuff initialized. 138/* Create a sctp_endpoint with all that boring stuff initialized.
141 * Returns NULL if there isn't enough memory. 139 * Returns NULL if there isn't enough memory.
142 */ 140 */
143struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) 141struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
142 unsigned int __nocast gfp)
144{ 143{
145 struct sctp_endpoint *ep; 144 struct sctp_endpoint *ep;
146 145
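
The endpoint timeout table now derives from the socket's RTO configuration via msecs_to_jiffies() rather than from compile-time SCTP_DEFAULT_TIMEOUT_* constants, keeping the units explicit. The arithmetic, for reference (srto_initial is in milliseconds; 3000 ms is the protocol's usual RTO.Initial):

        /* with HZ == 250: 3000 ms * 250 / 1000 == 750 jiffies */
        unsigned long t1 = msecs_to_jiffies(3000);
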
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 339f7acfdb64..5e085e041a6e 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -115,6 +115,17 @@ static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk)
115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); 115 atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc);
116} 116}
117 117
118struct sctp_input_cb {
119 union {
120 struct inet_skb_parm h4;
121#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
122 struct inet6_skb_parm h6;
123#endif
124 } header;
125 struct sctp_chunk *chunk;
126};
127#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0]))
128
118/* 129/*
119 * This is the routine which IP calls when receiving an SCTP packet. 130 * This is the routine which IP calls when receiving an SCTP packet.
120 */ 131 */
@@ -243,6 +254,7 @@ int sctp_rcv(struct sk_buff *skb)
243 ret = -ENOMEM; 254 ret = -ENOMEM;
244 goto discard_release; 255 goto discard_release;
245 } 256 }
257 SCTP_INPUT_CB(skb)->chunk = chunk;
246 258
247 sctp_rcv_set_owner_r(skb,sk); 259 sctp_rcv_set_owner_r(skb,sk);
248 260
@@ -265,9 +277,9 @@ int sctp_rcv(struct sk_buff *skb)
265 sctp_bh_lock_sock(sk); 277 sctp_bh_lock_sock(sk);
266 278
267 if (sock_owned_by_user(sk)) 279 if (sock_owned_by_user(sk))
268 sk_add_backlog(sk, (struct sk_buff *) chunk); 280 sk_add_backlog(sk, skb);
269 else 281 else
270 sctp_backlog_rcv(sk, (struct sk_buff *) chunk); 282 sctp_backlog_rcv(sk, skb);
271 283
272 /* Release the sock and any reference counts we took in the 284 /* Release the sock and any reference counts we took in the
273 * lookup calls. 285 * lookup calls.
@@ -302,14 +314,8 @@ discard_release:
302 */ 314 */
303int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) 315int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
304{ 316{
305 struct sctp_chunk *chunk; 317 struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
306 struct sctp_inq *inqueue; 318 struct sctp_inq *inqueue = &chunk->rcvr->inqueue;
307
308 /* One day chunk will live inside the skb, but for
309 * now this works.
310 */
311 chunk = (struct sctp_chunk *) skb;
312 inqueue = &chunk->rcvr->inqueue;
313 319
314 sctp_inq_push(inqueue, chunk); 320 sctp_inq_push(inqueue, chunk);
315 return 0; 321 return 0;
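
The backlog fix above stops casting a struct sctp_chunk pointer to a struct sk_buff pointer (which only worked by accident of layout) and instead stashes the chunk in skb->cb, the 48-byte control buffer each layer may overlay while it owns the skb. The pattern, with hypothetical names:

        struct my_cb {
                struct my_state *state; /* must fit in skb->cb (48 bytes) */
        };
        #define MY_CB(skb) ((struct my_cb *)&((skb)->cb[0]))

        /* producer, while queueing: */
        MY_CB(skb)->state = st;
        /* consumer, e.g. in the backlog handler: */
        st = MY_CB(skb)->state;
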
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index cedf4351556c..2d33922c044b 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -50,7 +50,7 @@
50/* Initialize an SCTP inqueue. */ 50/* Initialize an SCTP inqueue. */
51void sctp_inq_init(struct sctp_inq *queue) 51void sctp_inq_init(struct sctp_inq *queue)
52{ 52{
53 skb_queue_head_init(&queue->in); 53 INIT_LIST_HEAD(&queue->in_chunk_list);
54 queue->in_progress = NULL; 54 queue->in_progress = NULL;
55 55
56 /* Create a task for delivering data. */ 56 /* Create a task for delivering data. */
@@ -62,11 +62,13 @@ void sctp_inq_init(struct sctp_inq *queue)
62/* Release the memory associated with an SCTP inqueue. */ 62/* Release the memory associated with an SCTP inqueue. */
63void sctp_inq_free(struct sctp_inq *queue) 63void sctp_inq_free(struct sctp_inq *queue)
64{ 64{
65 struct sctp_chunk *chunk; 65 struct sctp_chunk *chunk, *tmp;
66 66
67 /* Empty the queue. */ 67 /* Empty the queue. */
68 while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) 68 list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) {
69 list_del_init(&chunk->list);
69 sctp_chunk_free(chunk); 70 sctp_chunk_free(chunk);
71 }
70 72
71 /* If there is a packet which is currently being worked on, 73 /* If there is a packet which is currently being worked on,
72 * free it as well. 74 * free it as well.
@@ -92,7 +94,7 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet)
92 * Eventually, we should clean up inqueue to not rely 94 * Eventually, we should clean up inqueue to not rely
93 * on the BH related data structures. 95 * on the BH related data structures.
94 */ 96 */
95 skb_queue_tail(&(q->in), (struct sk_buff *) packet); 97 list_add_tail(&packet->list, &q->in_chunk_list);
96 q->immediate.func(q->immediate.data); 98 q->immediate.func(q->immediate.data);
97} 99}
98 100
@@ -131,12 +133,16 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue)
131 133
132 /* Do we need to take the next packet out of the queue to process? */ 134 /* Do we need to take the next packet out of the queue to process? */
133 if (!chunk) { 135 if (!chunk) {
136 struct list_head *entry;
137
134 /* Is the queue empty? */ 138 /* Is the queue empty? */
135 if (skb_queue_empty(&queue->in)) 139 if (list_empty(&queue->in_chunk_list))
136 return NULL; 140 return NULL;
137 141
142 entry = queue->in_chunk_list.next;
138 chunk = queue->in_progress = 143 chunk = queue->in_progress =
139 (struct sctp_chunk *) skb_dequeue(&queue->in); 144 list_entry(entry, struct sctp_chunk, list);
145 list_del_init(entry);
140 146
141 /* This is the first chunk in the packet. */ 147 /* This is the first chunk in the packet. */
142 chunk->singleton = 1; 148 chunk->singleton = 1;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 84b5b370b09d..931371633464 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -108,7 +108,7 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
108 packet->transport = transport; 108 packet->transport = transport;
109 packet->source_port = sport; 109 packet->source_port = sport;
110 packet->destination_port = dport; 110 packet->destination_port = dport;
111 skb_queue_head_init(&packet->chunks); 111 INIT_LIST_HEAD(&packet->chunk_list);
112 if (asoc) { 112 if (asoc) {
113 struct sctp_sock *sp = sctp_sk(asoc->base.sk); 113 struct sctp_sock *sp = sctp_sk(asoc->base.sk);
114 overhead = sp->pf->af->net_header_len; 114 overhead = sp->pf->af->net_header_len;
@@ -129,12 +129,14 @@ struct sctp_packet *sctp_packet_init(struct sctp_packet *packet,
129/* Free a packet. */ 129/* Free a packet. */
130void sctp_packet_free(struct sctp_packet *packet) 130void sctp_packet_free(struct sctp_packet *packet)
131{ 131{
132 struct sctp_chunk *chunk; 132 struct sctp_chunk *chunk, *tmp;
133 133
134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 134 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
135 135
136 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) 136 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
137 list_del_init(&chunk->list);
137 sctp_chunk_free(chunk); 138 sctp_chunk_free(chunk);
139 }
138 140
139 if (packet->malloced) 141 if (packet->malloced)
140 kfree(packet); 142 kfree(packet);
@@ -276,7 +278,7 @@ append:
276 packet->has_sack = 1; 278 packet->has_sack = 1;
277 279
278 /* It is OK to send this chunk. */ 280 /* It is OK to send this chunk. */
279 __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); 281 list_add_tail(&chunk->list, &packet->chunk_list);
280 packet->size += chunk_len; 282 packet->size += chunk_len;
281 chunk->transport = packet->transport; 283 chunk->transport = packet->transport;
282finish: 284finish:
@@ -295,7 +297,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
295 struct sctphdr *sh; 297 struct sctphdr *sh;
296 __u32 crc32; 298 __u32 crc32;
297 struct sk_buff *nskb; 299 struct sk_buff *nskb;
298 struct sctp_chunk *chunk; 300 struct sctp_chunk *chunk, *tmp;
299 struct sock *sk; 301 struct sock *sk;
300 int err = 0; 302 int err = 0;
301 int padding; /* How much padding do we need? */ 303 int padding; /* How much padding do we need? */
@@ -305,11 +307,11 @@ int sctp_packet_transmit(struct sctp_packet *packet)
305 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); 307 SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet);
306 308
307 /* Do NOT generate a chunkless packet. */ 309 /* Do NOT generate a chunkless packet. */
308 chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); 310 if (list_empty(&packet->chunk_list))
309 if (unlikely(!chunk))
310 return err; 311 return err;
311 312
312 /* Set up convenience variables... */ 313 /* Set up convenience variables... */
314 chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list);
313 sk = chunk->skb->sk; 315 sk = chunk->skb->sk;
314 316
315 /* Allocate the new skb. */ 317 /* Allocate the new skb. */
@@ -370,7 +372,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
370 * [This whole comment explains WORD_ROUND() below.] 372 * [This whole comment explains WORD_ROUND() below.]
371 */ 373 */
372 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); 374 SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n");
373 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 375 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
376 list_del_init(&chunk->list);
374 if (sctp_chunk_is_data(chunk)) { 377 if (sctp_chunk_is_data(chunk)) {
375 378
376 if (!chunk->has_tsn) { 379 if (!chunk->has_tsn) {
@@ -511,7 +514,8 @@ err:
511 * will get resent or dropped later. 514 * will get resent or dropped later.
512 */ 515 */
513 516
514 while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { 517 list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) {
518 list_del_init(&chunk->list);
515 if (!sctp_chunk_is_data(chunk)) 519 if (!sctp_chunk_is_data(chunk))
516 sctp_chunk_free(chunk); 520 sctp_chunk_free(chunk);
517 } 521 }
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 4eb81a1407b7..efb72faba20c 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -75,7 +75,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
75static inline void sctp_outq_head_data(struct sctp_outq *q, 75static inline void sctp_outq_head_data(struct sctp_outq *q,
76 struct sctp_chunk *ch) 76 struct sctp_chunk *ch)
77{ 77{
78 __skb_queue_head(&q->out, (struct sk_buff *)ch); 78 list_add(&ch->list, &q->out_chunk_list);
79 q->out_qlen += ch->skb->len; 79 q->out_qlen += ch->skb->len;
80 return; 80 return;
81} 81}
@@ -83,17 +83,22 @@ static inline void sctp_outq_head_data(struct sctp_outq *q,
83/* Take data from the front of the queue. */ 83/* Take data from the front of the queue. */
84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) 84static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q)
85{ 85{
86 struct sctp_chunk *ch; 86 struct sctp_chunk *ch = NULL;
87 ch = (struct sctp_chunk *)__skb_dequeue(&q->out); 87
88 if (ch) 88 if (!list_empty(&q->out_chunk_list)) {
89 struct list_head *entry = q->out_chunk_list.next;
90
91 ch = list_entry(entry, struct sctp_chunk, list);
92 list_del_init(entry);
89 q->out_qlen -= ch->skb->len; 93 q->out_qlen -= ch->skb->len;
94 }
90 return ch; 95 return ch;
91} 96}
92/* Add data chunk to the end of the queue. */ 97/* Add data chunk to the end of the queue. */
93static inline void sctp_outq_tail_data(struct sctp_outq *q, 98static inline void sctp_outq_tail_data(struct sctp_outq *q,
94 struct sctp_chunk *ch) 99 struct sctp_chunk *ch)
95{ 100{
96 __skb_queue_tail(&q->out, (struct sk_buff *)ch); 101 list_add_tail(&ch->list, &q->out_chunk_list);
97 q->out_qlen += ch->skb->len; 102 q->out_qlen += ch->skb->len;
98 return; 103 return;
99} 104}
@@ -197,8 +202,8 @@ static inline int sctp_cacc_skip(struct sctp_transport *primary,
197void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) 202void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q)
198{ 203{
199 q->asoc = asoc; 204 q->asoc = asoc;
200 skb_queue_head_init(&q->out); 205 INIT_LIST_HEAD(&q->out_chunk_list);
201 skb_queue_head_init(&q->control); 206 INIT_LIST_HEAD(&q->control_chunk_list);
202 INIT_LIST_HEAD(&q->retransmit); 207 INIT_LIST_HEAD(&q->retransmit);
203 INIT_LIST_HEAD(&q->sacked); 208 INIT_LIST_HEAD(&q->sacked);
204 INIT_LIST_HEAD(&q->abandoned); 209 INIT_LIST_HEAD(&q->abandoned);
@@ -217,7 +222,7 @@ void sctp_outq_teardown(struct sctp_outq *q)
217{ 222{
218 struct sctp_transport *transport; 223 struct sctp_transport *transport;
219 struct list_head *lchunk, *pos, *temp; 224 struct list_head *lchunk, *pos, *temp;
220 struct sctp_chunk *chunk; 225 struct sctp_chunk *chunk, *tmp;
221 226
222 /* Throw away unacknowledged chunks. */ 227 /* Throw away unacknowledged chunks. */
223 list_for_each(pos, &q->asoc->peer.transport_addr_list) { 228 list_for_each(pos, &q->asoc->peer.transport_addr_list) {
@@ -269,8 +274,10 @@ void sctp_outq_teardown(struct sctp_outq *q)
269 q->error = 0; 274 q->error = 0;
270 275
271 /* Throw away any leftover control chunks. */ 276 /* Throw away any leftover control chunks. */
272 while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) 277 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
278 list_del_init(&chunk->list);
273 sctp_chunk_free(chunk); 279 sctp_chunk_free(chunk);
280 }
274} 281}
275 282
276/* Free the outqueue structure and any related pending chunks. */ 283/* Free the outqueue structure and any related pending chunks. */
@@ -333,7 +340,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
333 break; 340 break;
334 }; 341 };
335 } else { 342 } else {
336 __skb_queue_tail(&q->control, (struct sk_buff *) chunk); 343 list_add_tail(&chunk->list, &q->control_chunk_list);
337 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 344 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
338 } 345 }
339 346
@@ -650,10 +657,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
650 __u16 sport = asoc->base.bind_addr.port; 657 __u16 sport = asoc->base.bind_addr.port;
651 __u16 dport = asoc->peer.port; 658 __u16 dport = asoc->peer.port;
652 __u32 vtag = asoc->peer.i.init_tag; 659 __u32 vtag = asoc->peer.i.init_tag;
653 struct sk_buff_head *queue;
654 struct sctp_transport *transport = NULL; 660 struct sctp_transport *transport = NULL;
655 struct sctp_transport *new_transport; 661 struct sctp_transport *new_transport;
656 struct sctp_chunk *chunk; 662 struct sctp_chunk *chunk, *tmp;
657 sctp_xmit_t status; 663 sctp_xmit_t status;
658 int error = 0; 664 int error = 0;
659 int start_timer = 0; 665 int start_timer = 0;
@@ -675,8 +681,9 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
675 * ... 681 * ...
676 */ 682 */
677 683
678 queue = &q->control; 684 list_for_each_entry_safe(chunk, tmp, &q->control_chunk_list, list) {
679 while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { 685 list_del_init(&chunk->list);
686
680 /* Pick the right transport to use. */ 687 /* Pick the right transport to use. */
681 new_transport = chunk->transport; 688 new_transport = chunk->transport;
682 689
@@ -814,8 +821,6 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
814 821
815 /* Finally, transmit new packets. */ 822 /* Finally, transmit new packets. */
816 start_timer = 0; 823 start_timer = 0;
817 queue = &q->out;
818
819 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { 824 while ((chunk = sctp_outq_dequeue_data(q)) != NULL) {
820 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid 825 /* RFC 2960 6.5 Every DATA chunk MUST carry a valid
821 * stream identifier. 826 * stream identifier.
@@ -1149,8 +1154,9 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack)
1149 /* See if all chunks are acked. 1154 /* See if all chunks are acked.
1150 * Make sure the empty queue handler will get run later. 1155 * Make sure the empty queue handler will get run later.
1151 */ 1156 */
1152 q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && 1157 q->empty = (list_empty(&q->out_chunk_list) &&
1153 list_empty(&q->retransmit); 1158 list_empty(&q->control_chunk_list) &&
1159 list_empty(&q->retransmit));
1154 if (!q->empty) 1160 if (!q->empty)
1155 goto finish; 1161 goto finish;
1156 1162
@@ -1679,9 +1685,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1679 if (TSN_lte(tsn, ctsn)) { 1685 if (TSN_lte(tsn, ctsn)) {
1680 list_del_init(lchunk); 1686 list_del_init(lchunk);
1681 if (!chunk->tsn_gap_acked) { 1687 if (!chunk->tsn_gap_acked) {
1682 chunk->transport->flight_size -= 1688 chunk->transport->flight_size -=
1683 sctp_data_size(chunk); 1689 sctp_data_size(chunk);
1684 q->outstanding_bytes -= sctp_data_size(chunk); 1690 q->outstanding_bytes -= sctp_data_size(chunk);
1685 } 1691 }
1686 sctp_chunk_free(chunk); 1692 sctp_chunk_free(chunk);
1687 } else { 1693 } else {
@@ -1729,7 +1735,7 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1729 nskips, &ftsn_skip_arr[0]); 1735 nskips, &ftsn_skip_arr[0]);
1730 1736
1731 if (ftsn_chunk) { 1737 if (ftsn_chunk) {
1732 __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); 1738 list_add_tail(&ftsn_chunk->list, &q->control_chunk_list);
1733 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); 1739 SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS);
1734 } 1740 }
1735} 1741}
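
All of these sctp queue conversions replace sk_buff_head queues of casted chunk pointers with list_heads embedded in struct sctp_chunk. Recovering the chunk from its list node is plain container arithmetic; list_entry() is container_of(), roughly:

        #define list_entry(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        /* e.g. given struct list_head *entry taken off out_chunk_list: */
        chunk = list_entry(entry, struct sctp_chunk, list);
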
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5135e1a25d25..ce9245e71fca 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
219 219
220/* Copy the local addresses which are valid for 'scope' into 'bp'. */ 220/* Copy the local addresses which are valid for 'scope' into 'bp'. */
221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, 221int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
222 int gfp, int copy_flags) 222 unsigned int __nocast gfp, int copy_flags)
223{ 223{
224 struct sctp_sockaddr_entry *addr; 224 struct sctp_sockaddr_entry *addr;
225 int error = 0; 225 int error = 0;
@@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
1050 sctp_sndbuf_policy = 0; 1050 sctp_sndbuf_policy = 0;
1051 1051
1052 /* HB.interval - 30 seconds */ 1052 /* HB.interval - 30 seconds */
1053 sctp_hb_interval = 30 * HZ; 1053 sctp_hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
1054
1055 /* delayed SACK timeout */
1056 sctp_sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK;
1054 1057
1055 /* Implementation specific variables. */ 1058 /* Implementation specific variables. */
1056 1059
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 5baed9bb7de5..00d32b7c8266 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
78static int sctp_process_param(struct sctp_association *asoc, 78static int sctp_process_param(struct sctp_association *asoc,
79 union sctp_params param, 79 union sctp_params param,
80 const union sctp_addr *peer_addr, 80 const union sctp_addr *peer_addr,
81 int gfp); 81 unsigned int __nocast gfp);
82 82
83/* What was the inbound interface for this chunk? */ 83/* What was the inbound interface for this chunk? */
84int sctp_chunk_iif(const struct sctp_chunk *chunk) 84int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
174 */ 174 */
175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, 175struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
176 const struct sctp_bind_addr *bp, 176 const struct sctp_bind_addr *bp,
177 int gfp, int vparam_len) 177 unsigned int __nocast gfp, int vparam_len)
178{ 178{
179 sctp_inithdr_t init; 179 sctp_inithdr_t init;
180 union sctp_params addrs; 180 union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
261 261
262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, 262struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
263 const struct sctp_chunk *chunk, 263 const struct sctp_chunk *chunk,
264 int gfp, int unkparam_len) 264 unsigned int __nocast gfp, int unkparam_len)
265{ 265{
266 sctp_inithdr_t initack; 266 sctp_inithdr_t initack;
267 struct sctp_chunk *retval; 267 struct sctp_chunk *retval;
@@ -1003,6 +1003,7 @@ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); 1003 SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb);
1004 } 1004 }
1005 1005
1006 INIT_LIST_HEAD(&retval->list);
1006 retval->skb = skb; 1007 retval->skb = skb;
1007 retval->asoc = (struct sctp_association *)asoc; 1008 retval->asoc = (struct sctp_association *)asoc;
1008 retval->resent = 0; 1009 retval->resent = 0;
@@ -1116,8 +1117,7 @@ static void sctp_chunk_destroy(struct sctp_chunk *chunk)
1116/* Possibly, free the chunk. */ 1117/* Possibly, free the chunk. */
1117void sctp_chunk_free(struct sctp_chunk *chunk) 1118void sctp_chunk_free(struct sctp_chunk *chunk)
1118{ 1119{
1119 /* Make sure that we are not on any list. */ 1120 BUG_ON(!list_empty(&chunk->list));
1120 skb_unlink((struct sk_buff *) chunk);
1121 list_del_init(&chunk->transmitted_list); 1121 list_del_init(&chunk->transmitted_list);
1122 1122
1123 /* Release our reference on the message tracker. */ 1123 /* Release our reference on the message tracker. */
@@ -1233,7 +1233,8 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
1233 1233
1234/* Create a CLOSED association to use with an incoming packet. */ 1234/* Create a CLOSED association to use with an incoming packet. */
1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, 1235struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
1236 struct sctp_chunk *chunk, int gfp) 1236 struct sctp_chunk *chunk,
1237 unsigned int __nocast gfp)
1237{ 1238{
1238 struct sctp_association *asoc; 1239 struct sctp_association *asoc;
1239 struct sk_buff *skb; 1240 struct sk_buff *skb;
@@ -1348,7 +1349,7 @@ nodata:
1348struct sctp_association *sctp_unpack_cookie( 1349struct sctp_association *sctp_unpack_cookie(
1349 const struct sctp_endpoint *ep, 1350 const struct sctp_endpoint *ep,
1350 const struct sctp_association *asoc, 1351 const struct sctp_association *asoc,
1351 struct sctp_chunk *chunk, int gfp, 1352 struct sctp_chunk *chunk, unsigned int __nocast gfp,
1352 int *error, struct sctp_chunk **errp) 1353 int *error, struct sctp_chunk **errp)
1353{ 1354{
1354 struct sctp_association *retval = NULL; 1355 struct sctp_association *retval = NULL;
@@ -1812,7 +1813,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
1812 */ 1813 */
1813int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, 1814int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
1814 const union sctp_addr *peer_addr, 1815 const union sctp_addr *peer_addr,
1815 sctp_init_chunk_t *peer_init, int gfp) 1816 sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
1816{ 1817{
1817 union sctp_params param; 1818 union sctp_params param;
1818 struct sctp_transport *transport; 1819 struct sctp_transport *transport;
@@ -1983,7 +1984,7 @@ nomem:
1983static int sctp_process_param(struct sctp_association *asoc, 1984static int sctp_process_param(struct sctp_association *asoc,
1984 union sctp_params param, 1985 union sctp_params param,
1985 const union sctp_addr *peer_addr, 1986 const union sctp_addr *peer_addr,
1986 int gfp) 1987 unsigned int __nocast gfp)
1987{ 1988{
1988 union sctp_addr addr; 1989 union sctp_addr addr;
1989 int i; 1990 int i;
@@ -2739,8 +2740,12 @@ int sctp_process_asconf_ack(struct sctp_association *asoc,
2739 asoc->addip_last_asconf = NULL; 2740 asoc->addip_last_asconf = NULL;
2740 2741
2741 /* Send the next asconf chunk from the addip chunk queue. */ 2742 /* Send the next asconf chunk from the addip chunk queue. */
2742 asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); 2743 if (!list_empty(&asoc->addip_chunk_list)) {
2743 if (asconf) { 2744 struct list_head *entry = asoc->addip_chunk_list.next;
2745 asconf = list_entry(entry, struct sctp_chunk, list);
2746
2747 list_del_init(entry);
2748
2744 /* Hold the chunk until an ASCONF_ACK is received. */ 2749 /* Hold the chunk until an ASCONF_ACK is received. */
2745 sctp_chunk_hold(asconf); 2750 sctp_chunk_hold(asconf);
2746 if (sctp_primitive_ASCONF(asoc, asconf)) 2751 if (sctp_primitive_ASCONF(asoc, asconf))
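With the addip queue converted to a plain list, the head chunk is popped with list_entry() rather than by casting the chunk to a struct sk_buff for __skb_dequeue(). The idiom in isolation, assuming chunks linked through their new ->list member:

    static LIST_HEAD(q);            /* queue of pending chunks */
    struct sctp_chunk *c = NULL;

    if (!list_empty(&q)) {
            struct list_head *entry = q.next;       /* queue head */
            c = list_entry(entry, struct sctp_chunk, list);
            list_del_init(entry);                   /* unlink before use */
    }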
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 778639db125a..39c970b5b198 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
63 void *event_arg, 63 void *event_arg,
64 sctp_disposition_t status, 64 sctp_disposition_t status,
65 sctp_cmd_seq_t *commands, 65 sctp_cmd_seq_t *commands,
66 int gfp); 66 unsigned int __nocast gfp);
67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, 67static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
68 sctp_state_t state, 68 sctp_state_t state,
69 struct sctp_endpoint *ep, 69 struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
71 void *event_arg, 71 void *event_arg,
72 sctp_disposition_t status, 72 sctp_disposition_t status,
73 sctp_cmd_seq_t *commands, 73 sctp_cmd_seq_t *commands,
74 int gfp); 74 unsigned int __nocast gfp);
75 75
76/******************************************************************** 76/********************************************************************
77 * Helper functions 77 * Helper functions
@@ -497,7 +497,8 @@ static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands,
497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, 497static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
498 struct sctp_association *asoc, 498 struct sctp_association *asoc,
499 struct sctp_chunk *chunk, 499 struct sctp_chunk *chunk,
500 sctp_init_chunk_t *peer_init, int gfp) 500 sctp_init_chunk_t *peer_init,
501 unsigned int __nocast gfp)
501{ 502{
502 int error; 503 int error;
503 504
@@ -852,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
852 struct sctp_endpoint *ep, 853 struct sctp_endpoint *ep,
853 struct sctp_association *asoc, 854 struct sctp_association *asoc,
854 void *event_arg, 855 void *event_arg,
855 int gfp) 856 unsigned int __nocast gfp)
856{ 857{
857 sctp_cmd_seq_t commands; 858 sctp_cmd_seq_t commands;
858 const sctp_sm_table_entry_t *state_fn; 859 const sctp_sm_table_entry_t *state_fn;
@@ -897,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
897 void *event_arg, 898 void *event_arg,
898 sctp_disposition_t status, 899 sctp_disposition_t status,
899 sctp_cmd_seq_t *commands, 900 sctp_cmd_seq_t *commands,
900 int gfp) 901 unsigned int __nocast gfp)
901{ 902{
902 int error; 903 int error;
903 904
@@ -985,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
985 void *event_arg, 986 void *event_arg,
986 sctp_disposition_t status, 987 sctp_disposition_t status,
987 sctp_cmd_seq_t *commands, 988 sctp_cmd_seq_t *commands,
988 int gfp) 989 unsigned int __nocast gfp)
989{ 990{
990 int error = 0; 991 int error = 0;
991 int force; 992 int force;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c7c..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
92 sctp_cmd_seq_t *commands); 92 sctp_cmd_seq_t *commands);
93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); 93static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
94 94
95static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
96 __u16 error,
97 const struct sctp_association *asoc,
98 struct sctp_transport *transport);
99
100static sctp_disposition_t sctp_sf_violation_chunklen(
101 const struct sctp_endpoint *ep,
102 const struct sctp_association *asoc,
103 const sctp_subtype_t type,
104 void *arg,
105 sctp_cmd_seq_t *commands);
95 106
96/* Small helper function that checks if the chunk length 107/* Small helper function that checks if the chunk length
97 * is of the appropriate length. The 'required_length' argument 108 * is of the appropriate length. The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
2328 * 2339 *
2329 * This is common code called by several sctp_sf_*_abort() functions above. 2340 * This is common code called by several sctp_sf_*_abort() functions above.
2330 */ 2341 */
2331sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, 2342static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
2332 __u16 error, 2343 __u16 error,
2333 const struct sctp_association *asoc, 2344 const struct sctp_association *asoc,
2334 struct sctp_transport *transport) 2345 struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
3687 * 3698 *
3688 * Generate an ABORT chunk and terminate the association. 3699 * Generate an ABORT chunk and terminate the association.
3689 */ 3700 */
3690sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, 3701static sctp_disposition_t sctp_sf_violation_chunklen(
3702 const struct sctp_endpoint *ep,
3691 const struct sctp_association *asoc, 3703 const struct sctp_association *asoc,
3692 const sctp_subtype_t type, 3704 const sctp_subtype_t type,
3693 void *arg, 3705 void *arg,
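sctp_stop_t1_and_abort() and sctp_sf_violation_chunklen() lose their global linkage here, so the file gains static forward declarations (the block added at the top of sm_statefuns.c) that must match the later definitions exactly. The pattern in miniature:

    static int demo_helper(int x);          /* forward declaration */

    int demo_caller(void)
    {
            return demo_helper(1);          /* used before its definition */
    }

    static int demo_helper(int x)           /* definition: same prototype */
    {
            return x + 1;
    }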
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aad55dc3792b..091a66f06a35 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -406,7 +406,7 @@ static int sctp_send_asconf(struct sctp_association *asoc,
406 * transmission. 406 * transmission.
407 */ 407 */
408 if (asoc->addip_last_asconf) { 408 if (asoc->addip_last_asconf) {
409 __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); 409 list_add_tail(&chunk->list, &asoc->addip_chunk_list);
410 goto out; 410 goto out;
411 } 411 }
412 412
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index e627d2b451b6..25037daf3fa0 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -57,7 +57,8 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
57/* Create a new sctp_ssnmap. 57/* Create a new sctp_ssnmap.
58 * Allocate room to store at least 'len' contiguous TSNs. 58 * Allocate room to store at least 'len' contiguous TSNs.
59 */ 59 */
60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) 60struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
61 unsigned int __nocast gfp)
61{ 62{
62 struct sctp_ssnmap *retval; 63 struct sctp_ssnmap *retval;
63 int size; 64 int size;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc31849312b..dc4893474f18 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
47static ctl_handler sctp_sysctl_jiffies_ms; 47static ctl_handler sctp_sysctl_jiffies_ms;
48static long rto_timer_min = 1; 48static long rto_timer_min = 1;
49static long rto_timer_max = 86400000; /* One day */ 49static long rto_timer_max = 86400000; /* One day */
50static long sack_timer_min = 1;
51static long sack_timer_max = 500;
50 52
51static ctl_table sctp_table[] = { 53static ctl_table sctp_table[] = {
52 { 54 {
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
187 .mode = 0644, 189 .mode = 0644,
188 .proc_handler = &proc_dointvec 190 .proc_handler = &proc_dointvec
189 }, 191 },
192 {
193 .ctl_name = NET_SCTP_SACK_TIMEOUT,
194 .procname = "sack_timeout",
195 .data = &sctp_sack_timeout,
196 .maxlen = sizeof(long),
197 .mode = 0644,
198 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
199 .strategy = &sctp_sysctl_jiffies_ms,
200 .extra1 = &sack_timer_min,
201 .extra2 = &sack_timer_max,
202 },
190 { .ctl_name = 0 } 203 { .ctl_name = 0 }
191}; 204};
192 205
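The new sysctl stores the timeout in jiffies but exposes it in milliseconds, clamped to the [sack_timer_min, sack_timer_max] window by proc_doulongvec_ms_jiffies_minmax. A stripped-down sketch of such an entry; the names below are illustrative, and the real entry additionally carries .ctl_name = NET_SCTP_SACK_TIMEOUT and the module's jiffies-ms binary strategy handler:

    static long demo_min = 1;       /* lower clamp, in ms */
    static long demo_max = 500;     /* upper clamp, in ms */
    static long demo_timeout;       /* stored in jiffies */

    static ctl_table demo_table[] = {
            {
                    .procname     = "demo_timeout",
                    .data         = &demo_timeout,
                    .maxlen       = sizeof(long),
                    .mode         = 0644,
                    /* proc I/O in ms; converts and clamps on write */
                    .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
                    .extra1       = &demo_min,
                    .extra2       = &demo_max,
            },
            { .ctl_name = 0 }
    };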
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0ec0fde6e6c5..d2f04ebe5081 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
57/* Initialize a new transport from provided memory. */ 57/* Initialize a new transport from provided memory. */
58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, 58static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
59 const union sctp_addr *addr, 59 const union sctp_addr *addr,
60 int gfp) 60 unsigned int __nocast gfp)
61{ 61{
62 /* Copy in the address. */ 62 /* Copy in the address. */
63 peer->ipaddr = *addr; 63 peer->ipaddr = *addr;
@@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
103 103
104 /* Set up the heartbeat timer. */ 104 /* Set up the heartbeat timer. */
105 init_timer(&peer->hb_timer); 105 init_timer(&peer->hb_timer);
106 peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
107 peer->hb_timer.function = sctp_generate_heartbeat_event; 106 peer->hb_timer.function = sctp_generate_heartbeat_event;
108 peer->hb_timer.data = (unsigned long)peer; 107 peer->hb_timer.data = (unsigned long)peer;
109 108
@@ -122,7 +121,8 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
122} 121}
123 122
124/* Allocate and initialize a new transport. */ 123/* Allocate and initialize a new transport. */
125struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) 124struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
125 unsigned int __nocast gfp)
126{ 126{
127 struct sctp_transport *transport; 127 struct sctp_transport *transport;
128 128
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 17d0ff534735..0abd5101107c 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
74 74
75/* Create a new sctp_ulpevent. */ 75/* Create a new sctp_ulpevent. */
76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, 76SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
77 int gfp) 77 unsigned int __nocast gfp)
78{ 78{
79 struct sctp_ulpevent *event; 79 struct sctp_ulpevent *event;
80 struct sk_buff *skb; 80 struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( 136struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
137 const struct sctp_association *asoc, 137 const struct sctp_association *asoc,
138 __u16 flags, __u16 state, __u16 error, __u16 outbound, 138 __u16 flags, __u16 state, __u16 error, __u16 outbound,
139 __u16 inbound, int gfp) 139 __u16 inbound, unsigned int __nocast gfp)
140{ 140{
141 struct sctp_ulpevent *event; 141 struct sctp_ulpevent *event;
142 struct sctp_assoc_change *sac; 142 struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( 237struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
238 const struct sctp_association *asoc, 238 const struct sctp_association *asoc,
239 const struct sockaddr_storage *aaddr, 239 const struct sockaddr_storage *aaddr,
240 int flags, int state, int error, int gfp) 240 int flags, int state, int error, unsigned int __nocast gfp)
241{ 241{
242 struct sctp_ulpevent *event; 242 struct sctp_ulpevent *event;
243 struct sctp_paddr_change *spc; 243 struct sctp_paddr_change *spc;
@@ -350,7 +350,7 @@ fail:
350 */ 350 */
351struct sctp_ulpevent *sctp_ulpevent_make_remote_error( 351struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
352 const struct sctp_association *asoc, struct sctp_chunk *chunk, 352 const struct sctp_association *asoc, struct sctp_chunk *chunk,
353 __u16 flags, int gfp) 353 __u16 flags, unsigned int __nocast gfp)
354{ 354{
355 struct sctp_ulpevent *event; 355 struct sctp_ulpevent *event;
356 struct sctp_remote_error *sre; 356 struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
448 */ 448 */
449struct sctp_ulpevent *sctp_ulpevent_make_send_failed( 449struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
450 const struct sctp_association *asoc, struct sctp_chunk *chunk, 450 const struct sctp_association *asoc, struct sctp_chunk *chunk,
451 __u16 flags, __u32 error, int gfp) 451 __u16 flags, __u32 error, unsigned int __nocast gfp)
452{ 452{
453 struct sctp_ulpevent *event; 453 struct sctp_ulpevent *event;
454 struct sctp_send_failed *ssf; 454 struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
557 */ 557 */
558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( 558struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
559 const struct sctp_association *asoc, 559 const struct sctp_association *asoc,
560 __u16 flags, int gfp) 560 __u16 flags, unsigned int __nocast gfp)
561{ 561{
562 struct sctp_ulpevent *event; 562 struct sctp_ulpevent *event;
563 struct sctp_shutdown_event *sse; 563 struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
620 * 5.3.1.6 SCTP_ADAPTION_INDICATION 620 * 5.3.1.6 SCTP_ADAPTION_INDICATION
621 */ 621 */
622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( 622struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
623 const struct sctp_association *asoc, int gfp) 623 const struct sctp_association *asoc, unsigned int __nocast gfp)
624{ 624{
625 struct sctp_ulpevent *event; 625 struct sctp_ulpevent *event;
626 struct sctp_adaption_event *sai; 626 struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
657 */ 657 */
658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, 658struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
659 struct sctp_chunk *chunk, 659 struct sctp_chunk *chunk,
660 int gfp) 660 unsigned int __nocast gfp)
661{ 661{
662 struct sctp_ulpevent *event = NULL; 662 struct sctp_ulpevent *event = NULL;
663 struct sk_buff *skb; 663 struct sk_buff *skb;
@@ -718,7 +718,8 @@ fail:
718 * various events. 718 * various events.
719 */ 719 */
720struct sctp_ulpevent *sctp_ulpevent_make_pdapi( 720struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
721 const struct sctp_association *asoc, __u32 indication, int gfp) 721 const struct sctp_association *asoc, __u32 indication,
722 unsigned int __nocast gfp)
722{ 723{
723 struct sctp_ulpevent *event; 724 struct sctp_ulpevent *event;
724 struct sctp_pdapi_event *pd; 725 struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index d5dd2cf7ac4a..8bbc279d6c99 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
100 100
101/* Process an incoming DATA chunk. */ 101/* Process an incoming DATA chunk. */
102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 102int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
103 int gfp) 103 unsigned int __nocast gfp)
104{ 104{
105 struct sk_buff_head temp; 105 struct sk_buff_head temp;
106 sctp_data_chunk_t *hdr; 106 sctp_data_chunk_t *hdr;
@@ -778,7 +778,8 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
778 778
779/* Partial deliver the first message as there is pressure on rwnd. */ 779/* Partial deliver the first message as there is pressure on rwnd. */
780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, 780void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
781 struct sctp_chunk *chunk, int gfp) 781 struct sctp_chunk *chunk,
782 unsigned int __nocast gfp)
782{ 783{
783 struct sctp_ulpevent *event; 784 struct sctp_ulpevent *event;
784 struct sctp_association *asoc; 785 struct sctp_association *asoc;
@@ -802,7 +803,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
802 803
803/* Renege some packets to make room for an incoming chunk. */ 804/* Renege some packets to make room for an incoming chunk. */
804void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, 805void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
805 int gfp) 806 unsigned int __nocast gfp)
806{ 807{
807 struct sctp_association *asoc; 808 struct sctp_association *asoc;
808 __u16 needed, freed; 809 __u16 needed, freed;
@@ -841,7 +842,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
841/* Notify the application if an association is aborted and in 842/* Notify the application if an association is aborted and in
842 * partial delivery mode. Send up any pending received messages. 843 * partial delivery mode. Send up any pending received messages.
843 */ 844 */
844void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) 845void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
845{ 846{
846 struct sctp_ulpevent *ev = NULL; 847 struct sctp_ulpevent *ev = NULL;
847 struct sock *sk; 848 struct sock *sk;
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 32e8acbc60fe..62a073495276 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task);
41 41
42/* RPC client functions */ 42/* RPC client functions */
43EXPORT_SYMBOL(rpc_create_client); 43EXPORT_SYMBOL(rpc_create_client);
44EXPORT_SYMBOL(rpc_new_client);
44EXPORT_SYMBOL(rpc_clone_client); 45EXPORT_SYMBOL(rpc_clone_client);
45EXPORT_SYMBOL(rpc_bind_new_program); 46EXPORT_SYMBOL(rpc_bind_new_program);
46EXPORT_SYMBOL(rpc_destroy_client); 47EXPORT_SYMBOL(rpc_destroy_client);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..56db8f13e6cb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1185 arg->page_len = (pages-2)*PAGE_SIZE; 1185 arg->page_len = (pages-2)*PAGE_SIZE;
1186 arg->len = (pages-1)*PAGE_SIZE; 1186 arg->len = (pages-1)*PAGE_SIZE;
1187 arg->tail[0].iov_len = 0; 1187 arg->tail[0].iov_len = 0;
1188 1188
1189 try_to_freeze(PF_FREEZE); 1189 try_to_freeze();
1190 if (signalled()) 1190 if (signalled())
1191 return -EINTR; 1191 return -EINTR;
1192 1192
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
1227 1227
1228 schedule_timeout(timeout); 1228 schedule_timeout(timeout);
1229 1229
1230 try_to_freeze(PF_FREEZE); 1230 try_to_freeze();
1231 1231
1232 spin_lock_bh(&serv->sv_lock); 1232 spin_lock_bh(&serv->sv_lock);
1233 remove_wait_queue(&rqstp->rq_wait, &wait); 1233 remove_wait_queue(&rqstp->rq_wait, &wait);
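try_to_freeze() dropped its flags argument in this window, so the PF_FREEZE call sites shrink to the bare call. The updated idiom, sketched in a generic service loop; kthread_should_stop() is framing only:

    while (!kthread_should_stop()) {
            try_to_freeze();        /* park here during suspend */
            /* ... wait for and service one request ... */
    }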
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index eca92405948f..3c654e06b084 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -145,8 +145,6 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task)
145 if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { 145 if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) {
146 if (task == xprt->snd_task) 146 if (task == xprt->snd_task)
147 return 1; 147 return 1;
148 if (task == NULL)
149 return 0;
150 goto out_sleep; 148 goto out_sleep;
151 } 149 }
152 if (xprt->nocong || __xprt_get_cong(xprt, task)) { 150 if (xprt->nocong || __xprt_get_cong(xprt, task)) {
@@ -970,7 +968,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
970 goto out; 968 goto out;
971 } 969 }
972 970
973 dprintk("RPC: XID %08x read %u bytes\n", 971 dprintk("RPC: XID %08x read %Zd bytes\n",
974 ntohl(xprt->tcp_xid), r); 972 ntohl(xprt->tcp_xid), r);
975 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", 973 dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
976 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); 974 xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
@@ -1006,7 +1004,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
1006 desc->count -= len; 1004 desc->count -= len;
1007 desc->offset += len; 1005 desc->offset += len;
1008 xprt->tcp_offset += len; 1006 xprt->tcp_offset += len;
1009 dprintk("RPC: discarded %u bytes\n", len); 1007 dprintk("RPC: discarded %Zu bytes\n", len);
1010 tcp_check_recm(xprt); 1008 tcp_check_recm(xprt);
1011} 1009}
1012 1010
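The format-string fixes switch to %Z, the kernel vsprintf length modifier for size_t/ssize_t, keeping the dprintk()s warning-free on both 32- and 64-bit builds. In miniature:

    size_t copied = 128;
    ssize_t status = -1;

    printk(KERN_DEBUG "copied %Zu bytes\n", copied);  /* size_t  */
    printk(KERN_DEBUG "status %Zd\n", status);        /* ssize_t */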
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
new file mode 100644
index 000000000000..5a69733bcdad
--- /dev/null
+++ b/net/unix/Kconfig
@@ -0,0 +1,21 @@
1#
2# Unix Domain Sockets
3#
4
5config UNIX
6 tristate "Unix domain sockets"
7 ---help---
8 If you say Y here, you will include support for Unix domain sockets;
9 sockets are the standard Unix mechanism for establishing and
10 accessing network connections. Many commonly used programs such as
11 the X Window system and syslog use these sockets even if your
12 machine is not connected to any network. Unless you are working on
13 an embedded system or something similar, you therefore definitely
14 want to say Y here.
15
16 To compile this driver as a module, choose M here: the module will be
17 called unix. Note that several important services won't work
18 correctly if you say M here and then neglect to load the module.
19
20 Say Y unless you know what you are doing.
21
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c420eba4876b..d403e34088ad 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -302,7 +302,7 @@ static void unix_write_space(struct sock *sk)
302 * may receive messages only from that peer. */ 302 * may receive messages only from that peer. */
303static void unix_dgram_disconnected(struct sock *sk, struct sock *other) 303static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
304{ 304{
305 if (skb_queue_len(&sk->sk_receive_queue)) { 305 if (!skb_queue_empty(&sk->sk_receive_queue)) {
306 skb_queue_purge(&sk->sk_receive_queue); 306 skb_queue_purge(&sk->sk_receive_queue);
307 wake_up_interruptible_all(&unix_sk(sk)->peer_wait); 307 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
308 308
@@ -1619,7 +1619,7 @@ static long unix_stream_data_wait(struct sock * sk, long timeo)
1619 for (;;) { 1619 for (;;) {
1620 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 1620 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1621 1621
1622 if (skb_queue_len(&sk->sk_receive_queue) || 1622 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1623 sk->sk_err || 1623 sk->sk_err ||
1624 (sk->sk_shutdown & RCV_SHUTDOWN) || 1624 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1625 signal_pending(current) || 1625 signal_pending(current) ||
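Both af_unix call sites only ask whether the receive queue holds anything, so the truthiness test on skb_queue_len() becomes an explicit skb_queue_empty() check, which inspects the head pointers rather than reading qlen. The two equivalent forms, with handle_data() a hypothetical consumer:

    /* before: rely on the queue length being non-zero */
    if (skb_queue_len(&sk->sk_receive_queue))
            handle_data(sk);

    /* after: intent-revealing emptiness test */
    if (!skb_queue_empty(&sk->sk_receive_queue))
            handle_data(sk);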
diff --git a/net/wanrouter/Kconfig b/net/wanrouter/Kconfig
new file mode 100644
index 000000000000..1debe1cb054e
--- /dev/null
+++ b/net/wanrouter/Kconfig
@@ -0,0 +1,29 @@
1#
2# Configuration for WAN router
3#
4
5config WAN_ROUTER
6 tristate "WAN router"
7 depends on EXPERIMENTAL
8 ---help---
9 Wide Area Networks (WANs), such as X.25, frame relay and leased
10 lines, are used to interconnect Local Area Networks (LANs) over vast
11 distances with data transfer rates significantly higher than those
12 achievable with commonly used asynchronous modem connections.
13 Usually, a quite expensive external device called a `WAN router' is
14 needed to connect to a WAN.
15
16 As an alternative, WAN routing can be built into the Linux kernel.
17 With relatively inexpensive WAN interface cards available on the
18 market, a perfectly usable router can be built for less than half
19 the price of an external router. If you have one of those cards and
20 wish to use your Linux box as a WAN router, say Y here and also to
21 the WAN driver for your card, below. You will then need the
22 wan-tools package which is available from <ftp://ftp.sangoma.com/>.
23 Read <file:Documentation/networking/wan-router.txt> for more
24 information.
25
26 To compile WAN routing support as a module, choose M here: the
27 module will be called wanrouter.
28
29 If unsure, say N.
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac226f5..13b650ad22e2 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
358 */ 358 */
359 359
360 360
361unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) 361__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
362{ 362{
363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ 363 int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */
364 unsigned short ethertype; 364 __be16 ethertype;
365 365
366 switch (skb->data[cnt]) { 366 switch (skb->data[cnt]) {
367 case NLPID_IP: /* IP datagram */ 367 case NLPID_IP: /* IP datagram */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
379 skb->data[cnt+3], dev->name); 379 skb->data[cnt+3], dev->name);
380 return 0; 380 return 0;
381 } 381 }
382 ethertype = *((unsigned short*)&skb->data[cnt+4]); 382 ethertype = *((__be16*)&skb->data[cnt+4]);
383 cnt += 6; 383 cnt += 6;
384 break; 384 break;
385 385
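wanrouter_type_trans() now returns __be16, the sparse annotation for big-endian on-wire values; once a value carries that type, conversion to host order must go through ntohs()/htons() explicitly, or sparse (run with -D__CHECK_ENDIAN__) complains. A sketch, with the buffer and offset standing in for skb->data:

    #include <linux/types.h>        /* __be16, u16 */
    #include <linux/in.h>           /* ntohs() */

    unsigned char buf[8] = { 0 };           /* stand-in for packet data */
    __be16 wire = *(__be16 *)&buf[4];       /* raw big-endian halfword */
    u16 host = ntohs(wire);                 /* explicit conversion */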
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
new file mode 100644
index 000000000000..e6759c9660bb
--- /dev/null
+++ b/net/x25/Kconfig
@@ -0,0 +1,36 @@
1#
2# CCITT X.25 Packet Layer
3#
4
5config X25
6 tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)"
7 depends on EXPERIMENTAL
8 ---help---
9 X.25 is a set of standardized network protocols, similar in scope to
10 frame relay; the one physical line from your box to the X.25 network
11 entry point can carry several logical point-to-point connections
12 (called "virtual circuits") to other computers connected to the X.25
13 network. Governments, banks, and other organizations tend to use it
14 to connect to each other or to form Wide Area Networks (WANs). Many
15 countries have public X.25 networks. X.25 consists of two
16 protocols: the higher level Packet Layer Protocol (PLP) (say Y here
17 if you want that) and the lower level data link layer protocol LAPB
18 (say Y to "LAPB Data Link Driver" below if you want that).
19
20 You can read more about X.25 at <http://www.sangoma.com/x25.htm> and
21 <http://www.cisco.com/univercd/cc/td/doc/product/software/ios11/cbook/cx25.htm>.
22 Information about X.25 for Linux is contained in the files
23 <file:Documentation/networking/x25.txt> and
24 <file:Documentation/networking/x25-iface.txt>.
25
26 One connects to an X.25 network either with a dedicated network card
27 using the X.21 protocol (not yet supported by Linux) or one can do
28 X.25 over a standard telephone line using an ordinary modem (say Y
29 to "X.25 async driver" below) or over Ethernet using an ordinary
30 Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link
31 Driver" and "LAPB over Ethernet driver" below).
32
33 To compile this driver as a module, choose M here: the module
34 will be called x25. If unsure, say N.
35
36
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 58ca6a972c48..0c1c04322baf 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -1,6 +1,10 @@
1# 1#
2# XFRM configuration 2# XFRM configuration
3# 3#
4config XFRM
5 bool
6 depends on NET
7
4config XFRM_USER 8config XFRM_USER
5 tristate "IPsec user configuration interface" 9 tristate "IPsec user configuration interface"
6 depends on INET && XFRM 10 depends on INET && XFRM
@@ -10,3 +14,14 @@ config XFRM_USER
10 14
11 If unsure, say Y. 15 If unsure, say Y.
12 16
17config NET_KEY
18 tristate "PF_KEY sockets"
19 select XFRM
20 ---help---
21 PF_KEYv2 socket family, compatible with the KAME ones.
22 They are required if you are going to use IPsec tools ported
23 from KAME.
24
25 Say Y unless you know what you are doing.
26
27